YARN-4837. User facing aspects of 'AM blacklisting' feature need fixing. (vinodkv via wangda)

This commit is contained in:
Wangda Tan 2016-06-07 15:06:42 -07:00
parent be34e85e68
commit 620325e816
40 changed files with 536 additions and 895 deletions

View File

@ -1,67 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.api.records;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.yarn.util.Records;
/**
* Specific AMBlacklistingRequest from AM to enable/disable blacklisting.
*/
@Public
@Evolving
public abstract class AMBlackListingRequest {
@Private
@Unstable
public static AMBlackListingRequest newInstance(
boolean isAMBlackListingEnabled, float disableFailureThreshold) {
AMBlackListingRequest blackListRequest = Records
.newRecord(AMBlackListingRequest.class);
blackListRequest.setBlackListingEnabled(isAMBlackListingEnabled);
blackListRequest
.setBlackListingDisableFailureThreshold(disableFailureThreshold);
return blackListRequest;
}
/**
* @return AM Blacklisting is enabled.
*/
@Public
@Evolving
public abstract boolean isAMBlackListingEnabled();
/**
* @return AM Blacklisting disable failure threshold
*/
@Public
@Evolving
public abstract float getBlackListingDisableFailureThreshold();
@Private
@Unstable
public abstract void setBlackListingEnabled(boolean isAMBlackListingEnabled);
@Private
@Unstable
public abstract void setBlackListingDisableFailureThreshold(
float disableFailureThreshold);
}

View File

@ -535,27 +535,4 @@ public abstract class ApplicationSubmissionContext {
@Public
@Unstable
public abstract void setReservationID(ReservationId reservationID);
/**
* Get AM Blacklisting request object to know whether application needs any
* specific blacklisting for AM Nodes.
*
* @return AMBlackListingRequest object which has blacklisting information.
*/
@Public
@Unstable
public abstract AMBlackListingRequest getAMBlackListRequest();
/**
* Get AM Blacklisting request object to know whether application needs any
* specific blacklisting for AM Nodes.
*
* @param blackListRequest
* object which has blacklisting information such as
* "enable/disable AM blacklisting" and "disable failure threshold".
*/
@Public
@Unstable
public abstract void setAMBlackListRequest(
AMBlackListingRequest blackListRequest);
}

View File

@ -2553,14 +2553,25 @@ public class YarnConfiguration extends Configuration {
public static final long DEFAULT_RM_NODE_LABELS_PROVIDER_FETCH_INTERVAL_MS =
30 * 60 * 1000;
public static final String AM_BLACKLISTING_ENABLED =
YARN_PREFIX + "am.blacklisting.enabled";
public static final boolean DEFAULT_AM_BLACKLISTING_ENABLED = true;
public static final String AM_BLACKLISTING_DISABLE_THRESHOLD =
YARN_PREFIX + "am.blacklisting.disable-failure-threshold";
public static final float DEFAULT_AM_BLACKLISTING_DISABLE_THRESHOLD = 0.8f;
@Private
/**
* This is a private feature that isn't supposed to be used by end-users.
*/
public static final String AM_SCHEDULING_NODE_BLACKLISTING_ENABLED =
RM_PREFIX + "am-scheduling.node-blacklisting-enabled";
@Private
public static final boolean DEFAULT_AM_SCHEDULING_NODE_BLACKLISTING_ENABLED =
true;
@Private
/**
* This is a private feature that isn't supposed to be used by end-users.
*/
public static final String AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD =
RM_PREFIX + "am-scheduling.node-blacklisting-disable-threshold";
@Private
public static final float
DEFAULT_AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD = 0.8f;
private static final String NM_SCRIPT_BASED_NODE_LABELS_PROVIDER_PREFIX =
NM_NODE_LABELS_PROVIDER_PREFIX + "script.";

View File

@ -291,10 +291,6 @@ enum ExecutionTypeProto {
OPPORTUNISTIC = 2;
}
message AMBlackListingRequestProto {
optional bool blacklisting_enabled = 1 [default = false];
optional float blacklisting_failure_threshold = 2;
}
////////////////////////////////////////////////////////////////////////
////// From AM_RM_Protocol /////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////
@ -366,7 +362,6 @@ message ApplicationSubmissionContextProto {
optional ReservationIdProto reservation_id = 15;
optional string node_label_expression = 16;
optional ResourceRequestProto am_container_resource_request = 17;
optional AMBlackListingRequestProto am_blacklisting_request = 18;
}
message LogAggregationContextProto {

View File

@ -92,6 +92,13 @@ public class TestYarnConfigurationFields extends TestConfigurationFieldsBase {
.add(YarnConfiguration.DEFAULT_AMRM_PROXY_INTERCEPTOR_CLASS_PIPELINE);
configurationPropsToSkipCompare.add(YarnConfiguration.CURATOR_LEADER_ELECTOR);
// Ignore blacklisting nodes for AM failures feature since it is still a
// "work in progress"
configurationPropsToSkipCompare.add(YarnConfiguration.
AM_SCHEDULING_NODE_BLACKLISTING_ENABLED);
configurationPropsToSkipCompare.add(YarnConfiguration.
AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD);
// Ignore all YARN Application Timeline Service (version 1) properties
configurationPrefixToSkipCompare.add("yarn.timeline-service.");

View File

@ -1,104 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.api.records.impl.pb;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.yarn.api.records.AMBlackListingRequest;
import org.apache.hadoop.yarn.proto.YarnProtos.AMBlackListingRequestProto;
import org.apache.hadoop.yarn.proto.YarnProtos.AMBlackListingRequestProtoOrBuilder;
import com.google.protobuf.TextFormat;
@Private
@Unstable
public class AMBlackListingRequestPBImpl extends AMBlackListingRequest {
AMBlackListingRequestProto proto = AMBlackListingRequestProto
.getDefaultInstance();
AMBlackListingRequestProto.Builder builder = null;
boolean viaProto = false;
public AMBlackListingRequestPBImpl() {
builder = AMBlackListingRequestProto.newBuilder();
}
public AMBlackListingRequestPBImpl(AMBlackListingRequestProto proto) {
this.proto = proto;
viaProto = true;
}
public AMBlackListingRequestProto getProto() {
proto = viaProto ? proto : builder.build();
viaProto = true;
return proto;
}
private void maybeInitBuilder() {
if (viaProto || builder == null) {
builder = AMBlackListingRequestProto.newBuilder(proto);
}
viaProto = false;
}
@Override
public boolean isAMBlackListingEnabled() {
AMBlackListingRequestProtoOrBuilder p = viaProto ? proto : builder;
return p.getBlacklistingEnabled();
}
@Override
public float getBlackListingDisableFailureThreshold() {
AMBlackListingRequestProtoOrBuilder p = viaProto ? proto : builder;
return p.getBlacklistingFailureThreshold();
}
@Override
public void setBlackListingEnabled(boolean isAMBlackListingEnabled) {
maybeInitBuilder();
builder.setBlacklistingEnabled(isAMBlackListingEnabled);
}
@Override
public void setBlackListingDisableFailureThreshold(
float disableFailureThreshold) {
maybeInitBuilder();
builder.setBlacklistingFailureThreshold(disableFailureThreshold);
}
@Override
public int hashCode() {
return getProto().hashCode();
}
@Override
public boolean equals(Object other) {
if (other == null) {
return false;
}
if (other.getClass().isAssignableFrom(this.getClass())) {
return this.getProto().equals(this.getClass().cast(other).getProto());
}
return false;
}
@Override
public String toString() {
return TextFormat.shortDebugString(getProto());
}
}

View File

@ -24,7 +24,6 @@ import java.util.Set;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.records.AMBlackListingRequest;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@ -34,7 +33,6 @@ import org.apache.hadoop.yarn.api.records.ReservationId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.proto.YarnProtos.AMBlackListingRequestProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationIdProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationSubmissionContextProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationSubmissionContextProtoOrBuilder;
@ -65,7 +63,6 @@ extends ApplicationSubmissionContext {
private ResourceRequest amResourceRequest = null;
private LogAggregationContext logAggregationContext = null;
private ReservationId reservationId = null;
private AMBlackListingRequest amBlackListRequest = null;
public ApplicationSubmissionContextPBImpl() {
builder = ApplicationSubmissionContextProto.newBuilder();
@ -134,10 +131,6 @@ extends ApplicationSubmissionContext {
if (this.reservationId != null) {
builder.setReservationId(convertToProtoFormat(this.reservationId));
}
if (this.amBlackListRequest != null) {
builder.setAmBlacklistingRequest(
convertToProtoFormat(this.amBlackListRequest));
}
}
private void mergeLocalToProto() {
@ -420,29 +413,6 @@ extends ApplicationSubmissionContext {
return p.getKeepContainersAcrossApplicationAttempts();
}
@Override
public AMBlackListingRequest getAMBlackListRequest() {
ApplicationSubmissionContextProtoOrBuilder p = viaProto ? proto : builder;
if (amBlackListRequest != null) {
return amBlackListRequest;
}
if (!p.hasAmBlacklistingRequest()) {
return null;
}
amBlackListRequest = convertFromProtoFormat(p.getAmBlacklistingRequest());
return amBlackListRequest;
}
@Override
public void setAMBlackListRequest(AMBlackListingRequest amBlackListRequest) {
maybeInitBuilder();
if (amBlackListRequest == null) {
builder.clearAmBlacklistingRequest();
return;
}
this.amBlackListRequest = amBlackListRequest;
}
private PriorityPBImpl convertFromProtoFormat(PriorityProto p) {
return new PriorityPBImpl(p);
}
@ -485,16 +455,6 @@ extends ApplicationSubmissionContext {
return ((ResourcePBImpl)t).getProto();
}
private AMBlackListingRequestPBImpl convertFromProtoFormat(
AMBlackListingRequestProto a) {
return new AMBlackListingRequestPBImpl(a);
}
private AMBlackListingRequestProto convertToProtoFormat(
AMBlackListingRequest a) {
return ((AMBlackListingRequestPBImpl) a).getProto();
}
@Override
public String getNodeLabelExpression() {
ApplicationSubmissionContextProtoOrBuilder p = viaProto ? proto : builder;

View File

@ -2670,25 +2670,6 @@
<value>4096</value>
</property>
<property>
<description>
Enable/disable blacklisting of hosts for AM based on AM failures on those
hosts.
</description>
<name>yarn.am.blacklisting.enabled</name>
<value>true</value>
</property>
<property>
<description>
Threshold of ratio number of NodeManager hosts that are allowed to be
blacklisted for AM. Beyond this ratio there is no blacklisting to avoid
danger of blacklisting the entire cluster.
</description>
<name>yarn.am.blacklisting.disable-failure-threshold</name>
<value>0.8f</value>
</property>
<property>
<description>
Choose different implementation of node label's storage

View File

@ -109,7 +109,6 @@ import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.IncreaseContainersReso
import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.IncreaseContainersResourceResponsePBImpl;
import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.SubmitApplicationRequestPBImpl;
import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.SubmitApplicationResponsePBImpl;
import org.apache.hadoop.yarn.api.records.AMBlackListingRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptReport;
import org.apache.hadoop.yarn.api.records.ApplicationId;
@ -155,7 +154,6 @@ import org.apache.hadoop.yarn.api.records.StrictPreemptionContract;
import org.apache.hadoop.yarn.api.records.Token;
import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.api.records.YarnClusterMetrics;
import org.apache.hadoop.yarn.api.records.impl.pb.AMBlackListingRequestPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptIdPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationAttemptReportPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationIdPBImpl;
@ -190,7 +188,6 @@ import org.apache.hadoop.yarn.api.records.impl.pb.StrictPreemptionContractPBImpl
import org.apache.hadoop.yarn.api.records.impl.pb.TokenPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.URLPBImpl;
import org.apache.hadoop.yarn.api.records.impl.pb.YarnClusterMetricsPBImpl;
import org.apache.hadoop.yarn.proto.YarnProtos.AMBlackListingRequestProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationAttemptIdProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationAttemptReportProto;
import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationIdProto;
@ -515,7 +512,6 @@ public class TestPBImplRecords {
generateByNewInstance(ResourceAllocationRequest.class);
generateByNewInstance(ReservationAllocationState.class);
generateByNewInstance(ResourceUtilization.class);
generateByNewInstance(AMBlackListingRequest.class);
}
private class GetSetPair {
@ -1355,10 +1351,4 @@ public class TestPBImplRecords {
validatePBImplRecord(CheckForDecommissioningNodesResponsePBImpl.class,
CheckForDecommissioningNodesResponseProto.class);
}
@Test
public void testAMBlackListingRequestPBImpl() throws Exception {
validatePBImplRecord(AMBlackListingRequestPBImpl.class,
AMBlackListingRequestProto.class);
}
}

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.yarn.server.resourcemanager.blacklist;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
/**
* Tracks blacklists based on failures reported on nodes.
@ -33,14 +34,14 @@ public interface BlacklistManager {
void addNode(String node);
/**
* Get {@link BlacklistUpdates} that indicate which nodes should be
* Get {@link ResourceBlacklistRequest} that indicate which nodes should be
* added or to removed from the blacklist.
* @return {@link BlacklistUpdates}
* @return {@link ResourceBlacklistRequest}
*/
BlacklistUpdates getBlacklistUpdates();
ResourceBlacklistRequest getBlacklistUpdates();
/**
* Refresh the number of nodemanager hosts available for scheduling.
* Refresh the number of NodeManagers available for scheduling.
* @param nodeHostCount is the number of node hosts.
*/
void refreshNodeHostCount(int nodeHostCount);

View File

@ -1,47 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager.blacklist;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import java.util.List;
/**
* Class to track blacklist additions and removals.
*/
@Private
public class BlacklistUpdates {
private List<String> additions;
private List<String> removals;
public BlacklistUpdates(List<String> additions,
List<String> removals) {
this.additions = additions;
this.removals = removals;
}
public List<String> getAdditions() {
return additions;
}
public List<String> getRemovals() {
return removals;
}
}

View File

@ -20,21 +20,23 @@ package org.apache.hadoop.yarn.server.resourcemanager.blacklist;
import java.util.ArrayList;
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
/**
* A {@link BlacklistManager} that returns no blacklists.
*/
public class DisabledBlacklistManager implements BlacklistManager{
public class DisabledBlacklistManager implements BlacklistManager {
private static final ArrayList<String> EMPTY_LIST = new ArrayList<String>();
private BlacklistUpdates noBlacklist =
new BlacklistUpdates(EMPTY_LIST, EMPTY_LIST);
private ResourceBlacklistRequest noBlacklist =
ResourceBlacklistRequest.newInstance(EMPTY_LIST, EMPTY_LIST);
@Override
public void addNode(String node) {
}
@Override
public BlacklistUpdates getBlacklistUpdates() {
public ResourceBlacklistRequest getBlacklistUpdates() {
return noBlacklist;
}
@ -42,4 +44,4 @@ public class DisabledBlacklistManager implements BlacklistManager{
public void refreshNodeHostCount(int nodeHostCount) {
// Do nothing
}
}
}

View File

@ -18,14 +18,15 @@
package org.apache.hadoop.yarn.server.resourcemanager.blacklist;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
/**
* Maintains a list of failed nodes and returns that as long as number of
* blacklisted nodes is below a threshold percentage of total nodes. If more
@ -58,8 +59,8 @@ public class SimpleBlacklistManager implements BlacklistManager {
}
@Override
public BlacklistUpdates getBlacklistUpdates() {
BlacklistUpdates ret;
public ResourceBlacklistRequest getBlacklistUpdates() {
ResourceBlacklistRequest ret;
List<String> blacklist = new ArrayList<>(blacklistNodes);
final int currentBlacklistSize = blacklist.size();
final double failureThreshold = this.blacklistDisableFailureThreshold *
@ -70,13 +71,15 @@ public class SimpleBlacklistManager implements BlacklistManager {
"failure threshold ratio " + blacklistDisableFailureThreshold +
" out of total usable nodes " + numberOfNodeManagerHosts);
}
ret = new BlacklistUpdates(blacklist, EMPTY_LIST);
ret = ResourceBlacklistRequest.newInstance(blacklist, EMPTY_LIST);
} else {
LOG.warn("Ignoring Blacklists, blacklist size " + currentBlacklistSize
+ " is more than failure threshold ratio "
+ blacklistDisableFailureThreshold + " out of total usable nodes "
+ numberOfNodeManagerHosts);
ret = new BlacklistUpdates(EMPTY_LIST, blacklist);
// TODO: After the threshold hits, we will keep sending a long list
// every time a new AM is to be scheduled.
ret = ResourceBlacklistRequest.newInstance(EMPTY_LIST, blacklist);
}
return ret;
}

View File

@ -391,8 +391,8 @@ public class RMAppImpl implements RMApp, Recoverable {
stateMachine;
private static final int DUMMY_APPLICATION_ATTEMPT_NUMBER = -1;
private static final float MINIMUM_THRESHOLD_VALUE = 0.0f;
private static final float MAXIMUM_THRESHOLD_VALUE = 1.0f;
private static final float MINIMUM_AM_BLACKLIST_THRESHOLD_VALUE = 0.0f;
private static final float MAXIMUM_AM_BLACKLIST_THRESHOLD_VALUE = 1.0f;
public RMAppImpl(ApplicationId applicationId, RMContext rmContext,
Configuration config, String name, String user, String queue,
@ -471,42 +471,24 @@ public class RMAppImpl implements RMApp, Recoverable {
YarnConfiguration.RM_MAX_LOG_AGGREGATION_DIAGNOSTICS_IN_MEMORY,
YarnConfiguration.DEFAULT_RM_MAX_LOG_AGGREGATION_DIAGNOSTICS_IN_MEMORY);
// amBlacklistingEnabled can be configured globally and by each
// application.
// Case 1: If AMBlackListRequest is available in submission context, we
// will consider only app level request (RM level configuration will be
// skipped).
// Case 2: AMBlackListRequest is available in submission context and
// amBlacklisting is disabled. In this case, AM blacklisting wont be
// enabled for this app even if this feature is enabled in RM level.
// Case 3: AMBlackListRequest is not available through submission context.
// RM level AM black listing configuration will be considered.
if (null != submissionContext.getAMBlackListRequest()) {
amBlacklistingEnabled = submissionContext.getAMBlackListRequest()
.isAMBlackListingEnabled();
blacklistDisableThreshold = 0.0f;
if (amBlacklistingEnabled) {
blacklistDisableThreshold = submissionContext.getAMBlackListRequest()
.getBlackListingDisableFailureThreshold();
// Verify whether blacklistDisableThreshold is valid. And for invalid
// threshold, reset to global level blacklistDisableThreshold
// configured.
if (blacklistDisableThreshold < MINIMUM_THRESHOLD_VALUE
|| blacklistDisableThreshold > MAXIMUM_THRESHOLD_VALUE) {
blacklistDisableThreshold = conf.getFloat(
YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD,
YarnConfiguration.DEFAULT_AM_BLACKLISTING_DISABLE_THRESHOLD);
}
}
} else {
amBlacklistingEnabled = conf.getBoolean(
YarnConfiguration.AM_BLACKLISTING_ENABLED,
YarnConfiguration.DEFAULT_AM_BLACKLISTING_ENABLED);
if (amBlacklistingEnabled) {
blacklistDisableThreshold = conf.getFloat(
YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD,
YarnConfiguration.DEFAULT_AM_BLACKLISTING_DISABLE_THRESHOLD);
// amBlacklistingEnabled can be configured globally
// Just use the global values
amBlacklistingEnabled =
conf.getBoolean(
YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_ENABLED,
YarnConfiguration.DEFAULT_AM_SCHEDULING_NODE_BLACKLISTING_ENABLED);
if (amBlacklistingEnabled) {
blacklistDisableThreshold = conf.getFloat(
YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD,
YarnConfiguration.
DEFAULT_AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD);
// Verify whether blacklistDisableThreshold is valid. And for invalid
// threshold, reset to global level blacklistDisableThreshold
// configured.
if (blacklistDisableThreshold < MINIMUM_AM_BLACKLIST_THRESHOLD_VALUE ||
blacklistDisableThreshold > MAXIMUM_AM_BLACKLIST_THRESHOLD_VALUE) {
blacklistDisableThreshold = YarnConfiguration.
DEFAULT_AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD;
}
}
}
@ -877,15 +859,16 @@ public class RMAppImpl implements RMApp, Recoverable {
}
private void createNewAttempt(ApplicationAttemptId appAttemptId) {
BlacklistManager currentAMBlacklist;
BlacklistManager currentAMBlacklistManager;
if (currentAttempt != null) {
currentAMBlacklist = currentAttempt.getAMBlacklist();
// Transfer over the blacklist from the previous app-attempt.
currentAMBlacklistManager = currentAttempt.getAMBlacklistManager();
} else {
if (amBlacklistingEnabled) {
currentAMBlacklist = new SimpleBlacklistManager(
currentAMBlacklistManager = new SimpleBlacklistManager(
scheduler.getNumClusterNodes(), blacklistDisableThreshold);
} else {
currentAMBlacklist = new DisabledBlacklistManager();
currentAMBlacklistManager = new DisabledBlacklistManager();
}
}
RMAppAttempt attempt =
@ -896,7 +879,7 @@ public class RMAppImpl implements RMApp, Recoverable {
// hardware error and NM resync) + 1) equal to the max-attempt
// limit.
maxAppAttempts == (getNumFailedAppAttempts() + 1), amReq,
currentAMBlacklist);
currentAMBlacklistManager);
attempts.put(appAttemptId, attempt);
currentAttempt = attempt;
}
@ -1825,16 +1808,6 @@ public class RMAppImpl implements RMApp, Recoverable {
rmContext.getSystemMetricsPublisher().appCreated(app, startTime);
}
@VisibleForTesting
public boolean isAmBlacklistingEnabled() {
return amBlacklistingEnabled;
}
@VisibleForTesting
public float getAmBlacklistingDisableThreshold() {
return blacklistDisableThreshold;
}
@Private
@VisibleForTesting
public int getNextAttemptId() {

View File

@ -190,7 +190,7 @@ public interface RMAppAttempt extends EventHandler<RMAppAttemptEvent> {
* Get the {@link BlacklistManager} that manages blacklists for AM failures
* @return the {@link BlacklistManager} that tracks AM failures.
*/
BlacklistManager getAMBlacklist();
BlacklistManager getAMBlacklistManager();
/**
* the start time of the application.

View File

@ -57,6 +57,7 @@ import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.api.records.YarnApplicationAttemptState;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
@ -73,7 +74,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEvent;
import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEventType;
import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistManager;
import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistUpdates;
import org.apache.hadoop.yarn.server.resourcemanager.blacklist.DisabledBlacklistManager;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
@ -492,7 +492,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
ApplicationMasterService masterService,
ApplicationSubmissionContext submissionContext,
Configuration conf, boolean maybeLastAttempt, ResourceRequest amReq,
BlacklistManager amBlacklist) {
BlacklistManager amBlacklistManager) {
this.conf = conf;
this.applicationAttemptId = appAttemptId;
this.rmContext = rmContext;
@ -511,9 +511,9 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
this.attemptMetrics =
new RMAppAttemptMetrics(applicationAttemptId, rmContext);
this.amReq = amReq;
this.blacklistedNodesForAM = amBlacklist;
this.blacklistedNodesForAM = amBlacklistManager;
}
@Override
@ -1031,15 +1031,15 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
appAttempt.amReq.setResourceName(ResourceRequest.ANY);
appAttempt.amReq.setRelaxLocality(true);
appAttempt.getAMBlacklist().refreshNodeHostCount(
appAttempt.getAMBlacklistManager().refreshNodeHostCount(
appAttempt.scheduler.getNumClusterNodes());
BlacklistUpdates amBlacklist = appAttempt.getAMBlacklist()
.getBlacklistUpdates();
ResourceBlacklistRequest amBlacklist =
appAttempt.getAMBlacklistManager().getBlacklistUpdates();
if (LOG.isDebugEnabled()) {
LOG.debug("Using blacklist for AM: additions(" +
amBlacklist.getAdditions() + ") and removals(" +
amBlacklist.getRemovals() + ")");
amBlacklist.getBlacklistAdditions() + ") and removals(" +
amBlacklist.getBlacklistRemovals() + ")");
}
// AM resource has been checked when submission
Allocation amContainerAllocation =
@ -1047,8 +1047,8 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
appAttempt.applicationAttemptId,
Collections.singletonList(appAttempt.amReq),
EMPTY_CONTAINER_RELEASE_LIST,
amBlacklist.getAdditions(),
amBlacklist.getRemovals(), null, null);
amBlacklist.getBlacklistAdditions(),
amBlacklist.getBlacklistRemovals(), null, null);
if (amContainerAllocation != null
&& amContainerAllocation.getContainers() != null) {
assert (amContainerAllocation.getContainers().size() == 0);
@ -1481,9 +1481,36 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
}
}
private boolean shouldCountTowardsNodeBlacklisting(int exitStatus) {
return !(exitStatus == ContainerExitStatus.SUCCESS
|| exitStatus == ContainerExitStatus.PREEMPTED);
private static boolean shouldCountTowardsNodeBlacklisting(int exitStatus) {
switch (exitStatus) {
case ContainerExitStatus.PREEMPTED:
case ContainerExitStatus.KILLED_BY_RESOURCEMANAGER:
case ContainerExitStatus.KILLED_BY_APPMASTER:
case ContainerExitStatus.KILLED_AFTER_APP_COMPLETION:
case ContainerExitStatus.ABORTED:
// Neither the app's fault nor the system's fault. This happens by design,
// so no need for skipping nodes
return false;
case ContainerExitStatus.DISKS_FAILED:
// This container is marked with this exit-status means that the node is
// already marked as unhealthy given that most of the disks failed. So, no
// need for any explicit skipping of nodes.
return false;
case ContainerExitStatus.KILLED_EXCEEDED_VMEM:
case ContainerExitStatus.KILLED_EXCEEDED_PMEM:
// No point in skipping the node as it's not the system's fault
return false;
case ContainerExitStatus.SUCCESS:
return false;
case ContainerExitStatus.INVALID:
// Ideally, this shouldn't be considered for skipping a node. But in
// reality, it seems like there are cases where we are not setting
// exit-code correctly and so it's better to be conservative. See
// YARN-4284.
return true;
default:
return true;
}
}
private static final class UnmanagedAMAttemptSavedTransition
@ -1803,7 +1830,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
if (appAttempt.masterContainer != null
&& appAttempt.masterContainer.getId().equals(
containerStatus.getContainerId())) {
appAttempt.sendAMContainerToNM(appAttempt, containerFinishedEvent);
appAttempt.amContainerFinished(appAttempt, containerFinishedEvent);
// Remember the follow up transition and save the final attempt state.
appAttempt.rememberTargetTransitionsAndStoreState(event,
@ -1848,13 +1875,17 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
// Add am container to the list so that am container instance will be
// removed from NMContext.
private void sendAMContainerToNM(RMAppAttemptImpl appAttempt,
private static void amContainerFinished(RMAppAttemptImpl appAttempt,
RMAppAttemptContainerFinishedEvent containerFinishedEvent) {
NodeId nodeId = containerFinishedEvent.getNodeId();
if (containerFinishedEvent.getContainerStatus() != null) {
if (shouldCountTowardsNodeBlacklisting(containerFinishedEvent
.getContainerStatus().getExitStatus())) {
appAttempt.addAMNodeToBlackList(containerFinishedEvent.getNodeId());
ContainerStatus containerStatus =
containerFinishedEvent.getContainerStatus();
if (containerStatus != null) {
int exitStatus = containerStatus.getExitStatus();
if (shouldCountTowardsNodeBlacklisting(exitStatus)) {
appAttempt.addAMNodeToBlackList(nodeId);
}
} else {
LOG.warn("No ContainerStatus in containerFinishedEvent");
@ -1862,14 +1893,13 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
if (!appAttempt.getSubmissionContext()
.getKeepContainersAcrossApplicationAttempts()) {
finishedContainersSentToAM.putIfAbsent(nodeId,
appAttempt.finishedContainersSentToAM.putIfAbsent(nodeId,
new ArrayList<ContainerStatus>());
appAttempt.finishedContainersSentToAM.get(nodeId).add(
containerFinishedEvent.getContainerStatus());
appAttempt.finishedContainersSentToAM.get(nodeId).add(containerStatus);
appAttempt.sendFinishedContainersToNM();
} else {
appAttempt.sendFinishedAMContainerToNM(nodeId,
containerFinishedEvent.getContainerStatus().getContainerId());
containerStatus.getContainerId());
}
}
@ -1884,7 +1914,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
}
@Override
public BlacklistManager getAMBlacklist() {
public BlacklistManager getAMBlacklistManager() {
return blacklistedNodesForAM;
}
@ -1943,7 +1973,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
containerStatus.getContainerId())) {
new FinalTransition(RMAppAttemptState.FINISHED).transition(
appAttempt, containerFinishedEvent);
appAttempt.sendAMContainerToNM(appAttempt, containerFinishedEvent);
appAttempt.amContainerFinished(appAttempt, containerFinishedEvent);
return RMAppAttemptState.FINISHED;
}
// Add all finished containers so that they can be acked to NM.
@ -1968,7 +1998,8 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
// Thus, we still return FINAL_SAVING state here.
if (appAttempt.masterContainer.getId().equals(
containerStatus.getContainerId())) {
appAttempt.sendAMContainerToNM(appAttempt, containerFinishedEvent);
appAttempt.amContainerFinished(appAttempt, containerFinishedEvent);
if (appAttempt.targetedFinalState.equals(RMAppAttemptState.FAILED)
|| appAttempt.targetedFinalState.equals(RMAppAttemptState.KILLED)) {

View File

@ -178,7 +178,7 @@ public abstract class AbstractYarnScheduler
NodeFilter nodeFilter = new NodeFilter() {
@Override
public boolean accept(SchedulerNode node) {
return SchedulerAppUtils.isBlacklisted(app, node, LOG);
return SchedulerAppUtils.isPlaceBlacklisted(app, node, LOG);
}
};
return nodeTracker.getNodes(nodeFilter);

View File

@ -73,9 +73,13 @@ public class AppSchedulingInfo {
private ActiveUsersManager activeUsersManager;
private boolean pending = true; // whether accepted/allocated by scheduler
private ResourceUsage appResourceUsage;
private AtomicBoolean userBlacklistChanged = new AtomicBoolean(false);
private final Set<String> amBlacklist = new HashSet<>();
private Set<String> userBlacklist = new HashSet<>();
// Set of places (nodes / racks) blacklisted by the system. Today, this only
// has places blacklisted for AM containers.
private final Set<String> placesBlacklistedBySystem = new HashSet<>();
private Set<String> placesBlacklistedByApp = new HashSet<>();
private Set<String> requestedPartitions = new HashSet<>();
final Set<Priority> priorities = new TreeSet<>(COMPARATOR);
@ -447,32 +451,38 @@ public class AppSchedulingInfo {
}
/**
* The ApplicationMaster is updating the userBlacklist used for containers
* other than AMs.
* The ApplicationMaster is updating the placesBlacklistedByApp used for
* containers other than AMs.
*
* @param blacklistAdditions resources to be added to the userBlacklist
* @param blacklistRemovals resources to be removed from the userBlacklist
* @param blacklistAdditions
* resources to be added to the userBlacklist
* @param blacklistRemovals
* resources to be removed from the userBlacklist
*/
public void updateBlacklist(
public void updatePlacesBlacklistedByApp(
List<String> blacklistAdditions, List<String> blacklistRemovals) {
if (updateUserOrAMBlacklist(userBlacklist, blacklistAdditions,
if (updateBlacklistedPlaces(placesBlacklistedByApp, blacklistAdditions,
blacklistRemovals)) {
userBlacklistChanged.set(true);
}
}
/**
* RM is updating blacklist for AM containers.
* @param blacklistAdditions resources to be added to the amBlacklist
* @param blacklistRemovals resources to be added to the amBlacklist
* Update the list of places that are blacklisted by the system. Today the
* system only blacklists places when it sees that AMs failed there
*
* @param blacklistAdditions
* resources to be added to placesBlacklistedBySystem
* @param blacklistRemovals
* resources to be removed from placesBlacklistedBySystem
*/
public void updateAMBlacklist(
public void updatePlacesBlacklistedBySystem(
List<String> blacklistAdditions, List<String> blacklistRemovals) {
updateUserOrAMBlacklist(amBlacklist, blacklistAdditions,
updateBlacklistedPlaces(placesBlacklistedBySystem, blacklistAdditions,
blacklistRemovals);
}
boolean updateUserOrAMBlacklist(Set<String> blacklist,
private static boolean updateBlacklistedPlaces(Set<String> blacklist,
List<String> blacklistAdditions, List<String> blacklistRemovals) {
boolean changed = false;
synchronized (blacklist) {
@ -481,9 +491,7 @@ public class AppSchedulingInfo {
}
if (blacklistRemovals != null) {
if (blacklist.removeAll(blacklistRemovals)) {
changed = true;
}
changed = blacklist.removeAll(blacklistRemovals) || changed;
}
}
return changed;
@ -522,20 +530,24 @@ public class AppSchedulingInfo {
}
/**
* Returns if the node is either blacklisted by the user or the system
* @param resourceName the resourcename
* @param useAMBlacklist true if it should check amBlacklist
* Returns if the place (node/rack today) is either blacklisted by the
* application (user) or the system
*
* @param resourceName
* the resourcename
* @param blacklistedBySystem
* true if it should check amBlacklist
* @return true if its blacklisted
*/
public boolean isBlacklisted(String resourceName,
boolean useAMBlacklist) {
if (useAMBlacklist){
synchronized (amBlacklist) {
return amBlacklist.contains(resourceName);
public boolean isPlaceBlacklisted(String resourceName,
boolean blacklistedBySystem) {
if (blacklistedBySystem){
synchronized (placesBlacklistedBySystem) {
return placesBlacklistedBySystem.contains(resourceName);
}
} else {
synchronized (userBlacklist) {
return userBlacklist.contains(resourceName);
synchronized (placesBlacklistedByApp) {
return placesBlacklistedByApp.contains(resourceName);
}
}
}
@ -773,12 +785,12 @@ public class AppSchedulingInfo {
}
public Set<String> getBlackList() {
return this.userBlacklist;
return this.placesBlacklistedByApp;
}
public Set<String> getBlackListCopy() {
synchronized (userBlacklist) {
return new HashSet<>(this.userBlacklist);
synchronized (placesBlacklistedByApp) {
return new HashSet<>(this.placesBlacklistedByApp);
}
}
@ -786,7 +798,7 @@ public class AppSchedulingInfo {
AppSchedulingInfo appInfo) {
// This should not require locking the userBlacklist since it will not be
// used by this instance until after setCurrentAppAttempt.
this.userBlacklist = appInfo.getBlackList();
this.placesBlacklistedByApp = appInfo.getBlackList();
}
public synchronized void recoverContainer(RMContainer rmContainer) {

View File

@ -22,20 +22,20 @@ import org.apache.commons.logging.Log;
public class SchedulerAppUtils {
public static boolean isBlacklisted(SchedulerApplicationAttempt application,
SchedulerNode node, Log LOG) {
if (application.isBlacklisted(node.getNodeName())) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping 'host' " + node.getNodeName() +
public static boolean isPlaceBlacklisted(
SchedulerApplicationAttempt application, SchedulerNode node, Log log) {
if (application.isPlaceBlacklisted(node.getNodeName())) {
if (log.isDebugEnabled()) {
log.debug("Skipping 'host' " + node.getNodeName() +
" for " + application.getApplicationId() +
" since it has been blacklisted");
}
return true;
}
if (application.isBlacklisted(node.getRackName())) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping 'rack' " + node.getRackName() +
if (application.isPlaceBlacklisted(node.getRackName())) {
if (log.isDebugEnabled()) {
log.debug("Skipping 'rack' " + node.getRackName() +
" for " + application.getApplicationId() +
" since it has been blacklisted");
}

View File

@ -592,27 +592,26 @@ public class SchedulerApplicationAttempt implements SchedulableEntity {
return (!unmanagedAM && appAttempt.getMasterContainer() == null);
}
// Blacklist used for user containers
public synchronized void updateBlacklist(
List<String> blacklistAdditions, List<String> blacklistRemovals) {
public synchronized void updateBlacklist(List<String> blacklistAdditions,
List<String> blacklistRemovals) {
if (!isStopped) {
this.appSchedulingInfo.updateBlacklist(
blacklistAdditions, blacklistRemovals);
if (isWaitingForAMContainer()) {
// The request is for the AM-container, and the AM-container is launched
// by the system. So, update the places that are blacklisted by system
// (as opposed to those blacklisted by the application).
this.appSchedulingInfo.updatePlacesBlacklistedBySystem(
blacklistAdditions, blacklistRemovals);
} else {
this.appSchedulingInfo.updatePlacesBlacklistedByApp(blacklistAdditions,
blacklistRemovals);
}
}
}
// Blacklist used for AM containers
public synchronized void updateAMBlacklist(
List<String> blacklistAdditions, List<String> blacklistRemovals) {
if (!isStopped) {
this.appSchedulingInfo.updateAMBlacklist(
blacklistAdditions, blacklistRemovals);
}
}
public boolean isBlacklisted(String resourceName) {
boolean useAMBlacklist = isWaitingForAMContainer();
return this.appSchedulingInfo.isBlacklisted(resourceName, useAMBlacklist);
public boolean isPlaceBlacklisted(String resourceName) {
boolean forAMContainer = isWaitingForAMContainer();
return this.appSchedulingInfo.isPlaceBlacklisted(resourceName,
forAMContainer);
}
public synchronized int addMissedNonPartitionedRequestSchedulingOpportunity(

View File

@ -996,15 +996,8 @@ public class CapacityScheduler extends
application.showRequests();
}
}
if (application.isWaitingForAMContainer()) {
// Allocate is for AM and update AM blacklist for this
application.updateAMBlacklist(
blacklistAdditions, blacklistRemovals);
} else {
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
}
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
allocation = application.getAllocation(getResourceCalculator(),
getClusterResource(), getMinimumResourceCapability());

View File

@ -81,7 +81,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
private ContainerAllocation preCheckForNewContainer(Resource clusterResource,
FiCaSchedulerNode node, SchedulingMode schedulingMode,
ResourceLimits resourceLimits, Priority priority) {
if (SchedulerAppUtils.isBlacklisted(application, node, LOG)) {
if (SchedulerAppUtils.isPlaceBlacklisted(application, node, LOG)) {
application.updateAppSkipNodeDiagnostics(
CSAMContainerLaunchDiagnosticsConstants.SKIP_AM_ALLOCATION_IN_BLACK_LISTED_NODE);
return ContainerAllocation.APP_SKIPPED;

View File

@ -332,7 +332,7 @@ public class FSLeafQueue extends FSQueue {
readLock.unlock();
}
for (FSAppAttempt sched : pendingForResourceApps) {
if (SchedulerAppUtils.isBlacklisted(sched, node, LOG)) {
if (SchedulerAppUtils.isPlaceBlacklisted(sched, node, LOG)) {
continue;
}
assigned = sched.assignContainer(node);

View File

@ -993,13 +993,7 @@ public class FairScheduler extends
preemptionContainerIds.add(container.getContainerId());
}
if (application.isWaitingForAMContainer()) {
// Allocate is for AM and update AM blacklist for this
application.updateAMBlacklist(
blacklistAdditions, blacklistRemovals);
} else {
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
}
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
List<Container> newlyAllocatedContainers =
application.pullNewlyAllocatedContainers();

View File

@ -375,13 +375,7 @@ public class FifoScheduler extends
" #ask=" + ask.size());
}
if (application.isWaitingForAMContainer()) {
// Allocate is for AM and update AM blacklist for this
application.updateAMBlacklist(
blacklistAdditions, blacklistRemovals);
} else {
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
}
application.updateBlacklist(blacklistAdditions, blacklistRemovals);
Resource headroom = application.getHeadroom();
application.setApplicationHeadroomForMetrics(headroom);
@ -516,10 +510,10 @@ public class FifoScheduler extends
application.showRequests();
synchronized (application) {
// Check if this resource is on the blacklist
if (SchedulerAppUtils.isBlacklisted(application, node, LOG)) {
if (SchedulerAppUtils.isPlaceBlacklisted(application, node, LOG)) {
continue;
}
for (Priority priority : application.getPriorities()) {
int maxContainers =
getMaxAllocatableContainers(application, priority, node,

View File

@ -220,8 +220,9 @@ public class RMAppAttemptBlock extends AppAttemptBlock{
String appBlacklistedNodes =
getNodeString(rmAppAttempt.getBlacklistedNodes());
// nodes which are blacklisted by the RM for AM launches
String rmBlackListedNodes = getNodeString(
rmAppAttempt.getAMBlacklist().getBlacklistUpdates().getAdditions());
String rmBlackListedNodes =
getNodeString(rmAppAttempt.getAMBlacklistManager()
.getBlacklistUpdates().getBlacklistAdditions());
info("Application Attempt Overview")
._(
@ -256,8 +257,8 @@ public class RMAppAttemptBlock extends AppAttemptBlock{
"Diagnostics Info:",
appAttempt.getDiagnosticsInfo() == null ? "" : appAttempt
.getDiagnosticsInfo())
._("Application Blacklisted Nodes:", appBlacklistedNodes)
._("RM Blacklisted Nodes(for AM launches)", rmBlackListedNodes);
._("Nodes blacklisted by the application:", appBlacklistedNodes)
._("Nodes blacklisted by the system:", rmBlackListedNodes);
}
private String getNodeString(Collection<String> nodes) {

View File

@ -113,10 +113,10 @@ public class RMAppBlock extends AppBlock{
Hamlet.TBODY<Hamlet.TABLE<Hamlet>> tbody =
html.table("#attempts").thead().tr().th(".id", "Attempt ID")
.th(".started", "Started").th(".node", "Node").th(".logs", "Logs")
.th(".appBlacklistednodes", "Nodes black listed by the application",
"App Blacklisted Nodes")
.th(".rmBlacklistednodes", "Nodes black listed by the RM for the"
+ " app", "RM Blacklisted Nodes")._()._().tbody();
.th(".appBlacklistednodes", "Nodes blacklisted by the application",
"Nodes blacklisted by the app")
.th(".rmBlacklistednodes", "Nodes blacklisted by the RM for the"
+ " app", "Nodes blacklisted by the system")._()._().tbody();
RMApp rmApp = this.rm.getRMContext().getRMApps().get(this.appID);
if (rmApp == null) {
@ -136,8 +136,9 @@ public class RMAppBlock extends AppBlock{
// nodes which are blacklisted by the application
String appBlacklistedNodesCount = String.valueOf(nodes.size());
// nodes which are blacklisted by the RM for AM launches
String rmBlacklistedNodesCount = String.valueOf(rmAppAttempt
.getAMBlacklist().getBlacklistUpdates().getAdditions().size());
String rmBlacklistedNodesCount =
String.valueOf(rmAppAttempt.getAMBlacklistManager()
.getBlacklistUpdates().getBlacklistAdditions().size());
String nodeLink = attemptInfo.getNodeHttpAddress();
if (nodeLink != null) {
nodeLink = WebAppUtils.getHttpSchemePrefix(conf) + nodeLink;

View File

@ -85,12 +85,15 @@ import org.apache.hadoop.yarn.api.protocolrecords.KillApplicationResponse;
import org.apache.hadoop.yarn.api.protocolrecords.MoveApplicationAcrossQueuesRequest;
import org.apache.hadoop.yarn.api.protocolrecords.RenewDelegationTokenRequest;
import org.apache.hadoop.yarn.api.protocolrecords.RenewDelegationTokenResponse;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationDeleteRequest;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationListRequest;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationListResponse;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionResponse;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationUpdateRequest;
import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationRequest;
import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationResponse;
import org.apache.hadoop.yarn.api.protocolrecords.UpdateApplicationPriorityRequest;
import org.apache.hadoop.yarn.api.records.AMBlackListingRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
@ -130,7 +133,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AMBlackListingRequestInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppAttemptInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppAttemptsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppInfo;
@ -168,10 +170,6 @@ import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ReservationReque
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ReservationSubmissionRequestInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ReservationUpdateRequestInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ReservationUpdateResponseInfo;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationDeleteRequest;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionResponse;
import org.apache.hadoop.yarn.api.protocolrecords.ReservationUpdateRequest;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ResourceInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.SchedulerInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.SchedulerTypeInfo;
@ -1499,10 +1497,6 @@ public class RMWebServices extends WebServices {
reservationIdStr);
appContext.setReservationID(reservationId);
}
if (newApp.getAMBlackListingRequestInfo() != null) {
appContext.setAMBlackListRequest(createAMBlackListingRequest(
newApp.getAMBlackListingRequestInfo()));
}
return appContext;
}
@ -1651,13 +1645,6 @@ public class RMWebServices extends WebServices {
logAggregationContextInfo.getLogAggregationPolicyParameters());
}
private AMBlackListingRequest createAMBlackListingRequest(
AMBlackListingRequestInfo amBlackListingRequestInfo) {
return AMBlackListingRequest.newInstance(
amBlackListingRequestInfo.getAMBlackListingEnabled(),
amBlackListingRequestInfo.getBlackListingDisableFailureThreshold());
}
@POST
@Path("/delegation-token")
@Produces({ MediaType.APPLICATION_JSON, MediaType.APPLICATION_XML })

View File

@ -1,61 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager.webapp.dao;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
/**
* Simple class to allow users to send information required to create a
* AMBlackListingRequest which can then be used as part of the
* ApplicationSubmissionContext
*
*/
@XmlRootElement(name = "am-black-listing-requests")
@XmlAccessorType(XmlAccessType.FIELD)
public class AMBlackListingRequestInfo {
@XmlElement(name = "am-black-listing-enabled")
boolean isAMBlackListingEnabled;
@XmlElement(name = "disable-failure-threshold")
float disableFailureThreshold;
public AMBlackListingRequestInfo() {
}
public boolean getAMBlackListingEnabled() {
return isAMBlackListingEnabled;
}
public void setAMBlackListingEnabled(boolean isAMBlackListingEnabled) {
this.isAMBlackListingEnabled = isAMBlackListingEnabled;
}
public float getBlackListingDisableFailureThreshold() {
return disableFailureThreshold;
}
public void setBlackListingDisableFailureThreshold(
float disableFailureThreshold) {
this.disableFailureThreshold = disableFailureThreshold;
}
}

View File

@ -42,7 +42,7 @@ public class AppAttemptInfo {
protected String nodeId;
protected String logsLink;
protected String blacklistedNodes;
protected String rmBlacklistedNodesForAMLaunches;
private String nodesBlacklistedBySystem;
protected String appAttemptId;
public AppAttemptInfo() {
@ -69,9 +69,9 @@ public class AppAttemptInfo {
+ masterContainer.getNodeHttpAddress(),
ConverterUtils.toString(masterContainer.getId()), user);
rmBlacklistedNodesForAMLaunches = StringUtils.join(
attempt.getAMBlacklist().getBlacklistUpdates().getAdditions(),
", ");
nodesBlacklistedBySystem =
StringUtils.join(attempt.getAMBlacklistManager()
.getBlacklistUpdates().getBlacklistAdditions(), ", ");
if (rm.getResourceScheduler() instanceof AbstractYarnScheduler) {
AbstractYarnScheduler ayScheduler =
(AbstractYarnScheduler) rm.getResourceScheduler();

View File

@ -87,9 +87,6 @@ public class ApplicationSubmissionContextInfo {
@XmlElement(name = "reservation-id")
String reservationId;
@XmlElement(name = "am-black-listing-requests")
AMBlackListingRequestInfo amBlackListingRequestInfo;
public ApplicationSubmissionContextInfo() {
applicationId = "";
applicationName = "";
@ -106,7 +103,6 @@ public class ApplicationSubmissionContextInfo {
logAggregationContextInfo = null;
attemptFailuresValidityInterval = -1;
reservationId = "";
amBlackListingRequestInfo = null;
}
public String getApplicationId() {
@ -173,10 +169,6 @@ public class ApplicationSubmissionContextInfo {
return attemptFailuresValidityInterval;
}
public AMBlackListingRequestInfo getAMBlackListingRequestInfo() {
return amBlackListingRequestInfo;
}
public String getReservationId() {
return reservationId;
}
@ -252,9 +244,4 @@ public class ApplicationSubmissionContextInfo {
public void setReservationId(String reservationId) {
this.reservationId = reservationId;
}
public void setAMBlackListingRequestInfo(
AMBlackListingRequestInfo amBlackListingRequestInfo) {
this.amBlackListingRequestInfo = amBlackListingRequestInfo;
}
}

View File

@ -0,0 +1,251 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager;
import java.util.List;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.DrainDispatcher;
import org.apache.hadoop.yarn.event.EventDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.resourcemanager.applicationsmanager.TestAMRestart;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.Assert;
import org.junit.Test;
/**
* Validate system behavior when the am-scheduling logic 'blacklists' a node for
* an application because of AM failures.
*/
public class TestNodeBlacklistingOnAMFailures {
@Test(timeout = 100000)
public void testNodeBlacklistingOnAMFailure() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
ResourceScheduler.class);
conf.setBoolean(YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_ENABLED,
true);
DrainDispatcher dispatcher = new DrainDispatcher();
MockRM rm = startRM(conf, dispatcher);
CapacityScheduler scheduler = (CapacityScheduler) rm.getResourceScheduler();
MockNM nm1 =
new MockNM("127.0.0.1:1234", 8000, rm.getResourceTrackerService());
nm1.registerNode();
MockNM nm2 =
new MockNM("127.0.0.2:2345", 8000, rm.getResourceTrackerService());
nm2.registerNode();
RMApp app = rm.submitApp(200);
MockAM am1 = MockRM.launchAndRegisterAM(app, rm, nm1);
ContainerId amContainerId =
ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
RMContainer rmContainer = scheduler.getRMContainer(amContainerId);
NodeId nodeWhereAMRan = rmContainer.getAllocatedNode();
MockNM currentNode, otherNode;
if (nodeWhereAMRan.equals(nm1.getNodeId())) {
currentNode = nm1;
otherNode = nm2;
} else {
currentNode = nm2;
otherNode = nm1;
}
// Set the exist status to INVALID so that we can verify that the system
// automatically blacklisting the node
makeAMContainerExit(rm, amContainerId, currentNode,
ContainerExitStatus.INVALID);
// restart the am
RMAppAttempt attempt = MockRM.waitForAttemptScheduled(app, rm);
System.out.println("New AppAttempt launched " + attempt.getAppAttemptId());
// Try the current node a few times
for (int i = 0; i <= 2; i++) {
currentNode.nodeHeartbeat(true);
dispatcher.await();
Assert.assertEquals(
"AppAttemptState should still be SCHEDULED if currentNode is "
+ "blacklisted correctly", RMAppAttemptState.SCHEDULED,
attempt.getAppAttemptState());
}
// Now try the other node
otherNode.nodeHeartbeat(true);
dispatcher.await();
// Now the AM container should be allocated
MockRM.waitForState(attempt, RMAppAttemptState.ALLOCATED, 20000);
MockAM am2 = rm.sendAMLaunched(attempt.getAppAttemptId());
rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
amContainerId =
ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
rmContainer = scheduler.getRMContainer(amContainerId);
nodeWhereAMRan = rmContainer.getAllocatedNode();
// The other node should now receive the assignment
Assert.assertEquals(
"After blacklisting, AM should have run on the other node",
otherNode.getNodeId(), nodeWhereAMRan);
am2.registerAppAttempt();
rm.waitForState(app.getApplicationId(), RMAppState.RUNNING);
List<Container> allocatedContainers =
TestAMRestart.allocateContainers(currentNode, am2, 1);
Assert.assertEquals(
"Even though AM is blacklisted from the node, application can "
+ "still allocate non-AM containers there",
currentNode.getNodeId(), allocatedContainers.get(0).getNodeId());
}
@Test(timeout = 100000)
public void testNoBlacklistingForNonSystemErrors() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_ENABLED,
true);
// disable the float so it is possible to blacklist the entire cluster
conf.setFloat(
YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD,
1.5f);
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 100);
DrainDispatcher dispatcher = new DrainDispatcher();
MockRM rm = startRM(conf, dispatcher);
MockNM node =
new MockNM("127.0.0.1:1234", 8000, rm.getResourceTrackerService());
node.registerNode();
RMApp app = rm.submitApp(200);
ApplicationId appId = app.getApplicationId();
int numAppAttempts = 1;
// Now the AM container should be allocated
RMAppAttempt attempt = MockRM.waitForAttemptScheduled(app, rm);
node.nodeHeartbeat(true);
dispatcher.await();
MockRM.waitForState(attempt, RMAppAttemptState.ALLOCATED, 20000);
rm.sendAMLaunched(attempt.getAppAttemptId());
rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
ApplicationAttemptId appAttemptId =
ApplicationAttemptId.newInstance(appId, numAppAttempts);
ContainerId amContainerId = ContainerId.newContainerId(appAttemptId, 1);
for (int containerExitStatus : new int[] {
ContainerExitStatus.PREEMPTED,
ContainerExitStatus.KILLED_BY_RESOURCEMANAGER,
// ContainerExitStatus.KILLED_BY_APPMASTER,
ContainerExitStatus.KILLED_AFTER_APP_COMPLETION,
ContainerExitStatus.ABORTED, ContainerExitStatus.DISKS_FAILED,
ContainerExitStatus.KILLED_EXCEEDED_VMEM,
ContainerExitStatus.KILLED_EXCEEDED_PMEM }) {
// Set the exist status to be containerExitStatus so that we can verify
// that the system automatically blacklisting the node
makeAMContainerExit(rm, amContainerId, node, containerExitStatus);
// restart the am
attempt = MockRM.waitForAttemptScheduled(app, rm);
System.out
.println("New AppAttempt launched " + attempt.getAppAttemptId());
node.nodeHeartbeat(true);
dispatcher.await();
MockRM.waitForState(attempt, RMAppAttemptState.ALLOCATED, 20000);
rm.sendAMLaunched(attempt.getAppAttemptId());
rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
numAppAttempts++;
appAttemptId = ApplicationAttemptId.newInstance(appId, numAppAttempts);
amContainerId = ContainerId.newContainerId(appAttemptId, 1);
rm.waitForState(node, amContainerId, RMContainerState.ACQUIRED);
}
}
private void makeAMContainerExit(MockRM rm, ContainerId amContainer,
MockNM node, int exitStatus) throws Exception, InterruptedException {
ContainerStatus containerStatus =
BuilderUtils.newContainerStatus(amContainer, ContainerState.COMPLETE,
"", exitStatus, Resources.createResource(200));
node.containerStatus(containerStatus);
ApplicationAttemptId amAttemptID = amContainer.getApplicationAttemptId();
rm.waitForState(amAttemptID, RMAppAttemptState.FAILED);
rm.waitForState(amAttemptID.getApplicationId(), RMAppState.ACCEPTED);
}
private MockRM startRM(YarnConfiguration conf,
final DrainDispatcher dispatcher) {
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new MockRM(conf, memStore) {
@Override
protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
return new EventDispatcher<SchedulerEvent>(this.scheduler,
this.scheduler.getClass().getName()) {
@Override
public void handle(SchedulerEvent event) {
super.handle(event);
}
};
}
@Override
protected Dispatcher createDispatcher() {
return dispatcher;
}
};
rm1.start();
return rm1;
}
}

View File

@ -35,13 +35,8 @@ import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NMToken;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.DrainDispatcher;
import org.apache.hadoop.yarn.event.EventDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
@ -54,18 +49,14 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.util.ControlledClock;
import org.apache.hadoop.yarn.util.Records;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.junit.Assert;
import org.junit.Test;
@ -238,7 +229,7 @@ public class TestAMRestart {
rm1.stop();
}
private List<Container> allocateContainers(MockNM nm1, MockAM am1,
public static List<Container> allocateContainers(MockNM nm1, MockAM am1,
int NUM_CONTAINERS) throws Exception {
// allocate NUM_CONTAINERS containers
am1.allocate("127.0.0.1", 1024, NUM_CONTAINERS,
@ -277,7 +268,9 @@ public class TestAMRestart {
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 3);
// To prevent test from blacklisting nm1 for AM, we sit threshold to half
// of 2 nodes which is 1
conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD, 0.5f);
conf.setFloat(
YarnConfiguration.AM_SCHEDULING_NODE_BLACKLISTING_DISABLE_THRESHOLD,
0.5f);
MockRM rm1 = new MockRM(conf);
rm1.start();
@ -379,168 +372,6 @@ public class TestAMRestart {
rm1.stop();
}
@Test(timeout = 100000)
public void testAMBlacklistPreventsRestartOnSameNode() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
ResourceScheduler.class);
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
testAMBlacklistPreventRestartOnSameNode(false, conf);
}
@Test(timeout = 100000)
public void testAMBlacklistPreventsRestartOnSameNodeForMinicluster()
throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_SCHEDULER_INCLUDE_PORT_IN_NODE_NAME,
true);
testAMBlacklistPreventRestartOnSameNode(false, conf);
}
@Test(timeout = 100000)
public void testAMBlacklistPreemption() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
// disable the float so it is possible to blacklist the entire cluster
conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD, 1.5f);
// since the exit status is PREEMPTED, it should not lead to the node being
// blacklisted
testAMBlacklistPreventRestartOnSameNode(true, conf);
}
/**
* Tests AM blacklisting. In the multi-node mode (i.e. singleNode = false),
* it tests the blacklisting behavior so that the AM container gets allocated
* on the node that is not blacklisted. In the single-node mode, it tests the
* PREEMPTED status to see if the AM container can continue to be scheduled.
*/
private void testAMBlacklistPreventRestartOnSameNode(boolean singleNode,
YarnConfiguration conf) throws Exception {
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
final DrainDispatcher dispatcher = new DrainDispatcher();
MockRM rm1 = new MockRM(conf, memStore) {
@Override
protected EventHandler<SchedulerEvent> createSchedulerEventDispatcher() {
return new EventDispatcher<SchedulerEvent>(this.scheduler,
this.scheduler.getClass().getName()) {
@Override
public void handle(SchedulerEvent event) {
super.handle(event);
}
};
}
@Override
protected Dispatcher createDispatcher() {
return dispatcher;
}
};
rm1.start();
MockNM nm1 =
new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
nm1.registerNode();
MockNM nm2 = null;
if (!singleNode) {
nm2 =
new MockNM("127.0.0.2:2345", 8000, rm1.getResourceTrackerService());
nm2.registerNode();
}
RMApp app1 = rm1.submitApp(200);
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
CapacityScheduler scheduler =
(CapacityScheduler) rm1.getResourceScheduler();
ContainerId amContainer =
ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
// Preempt the first attempt;
RMContainer rmContainer = scheduler.getRMContainer(amContainer);
NodeId nodeWhereAMRan = rmContainer.getAllocatedNode();
MockNM currentNode, otherNode;
if (singleNode) {
Assert.assertEquals(nm1.getNodeId(), nodeWhereAMRan);
currentNode = nm1;
otherNode = null; // not applicable
} else {
if (nodeWhereAMRan == nm1.getNodeId()) {
currentNode = nm1;
otherNode = nm2;
} else {
currentNode = nm2;
otherNode = nm1;
}
}
// set the exist status to test
// any status other than SUCCESS and PREEMPTED should cause the node to be
// blacklisted
int exitStatus = singleNode ?
ContainerExitStatus.PREEMPTED :
ContainerExitStatus.INVALID;
ContainerStatus containerStatus =
BuilderUtils.newContainerStatus(amContainer, ContainerState.COMPLETE,
"", exitStatus, Resources.createResource(200));
currentNode.containerStatus(containerStatus);
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
// restart the am
RMAppAttempt attempt = MockRM.waitForAttemptScheduled(app1, rm1);
System.out.println("Launch AM " + attempt.getAppAttemptId());
currentNode.nodeHeartbeat(true);
dispatcher.await();
if (!singleNode) {
Assert.assertEquals(
"AppAttemptState should still be SCHEDULED if currentNode is " +
"blacklisted correctly",
RMAppAttemptState.SCHEDULED,
attempt.getAppAttemptState());
otherNode.nodeHeartbeat(true);
dispatcher.await();
}
MockAM am2 = rm1.sendAMLaunched(attempt.getAppAttemptId());
rm1.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED);
amContainer =
ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
rmContainer = scheduler.getRMContainer(amContainer);
nodeWhereAMRan = rmContainer.getAllocatedNode();
if (singleNode) {
// with preemption, the node should not be blacklisted and should get the
// assignment (with a single node)
Assert.assertEquals(
"AM should still have been able to run on the same node",
currentNode.getNodeId(), nodeWhereAMRan);
} else {
// with a failed status, the other node should receive the assignment
Assert.assertEquals(
"After blacklisting AM should have run on the other node",
otherNode.getNodeId(), nodeWhereAMRan);
am2.registerAppAttempt();
rm1.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
List<Container> allocatedContainers =
allocateContainers(currentNode, am2, 1);
Assert.assertEquals(
"Even though AM is blacklisted from the node, application can " +
"still allocate containers there",
currentNode.getNodeId(), allocatedContainers.get(0).getNodeId());
}
}
// AM container preempted, nm disk failure
// should not be counted towards AM max retry count.
@Test(timeout = 100000)

View File

@ -19,12 +19,13 @@
package org.apache.hadoop.yarn.server.resourcemanager.blacklist;
import org.junit.Assert;
import org.junit.Test;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
import org.junit.Assert;
import org.junit.Test;
public class TestBlacklistManager {
@Test
@ -37,12 +38,12 @@ public class TestBlacklistManager {
String anyNode2 = "bar";
manager.addNode(anyNode);
manager.addNode(anyNode2);
BlacklistUpdates blacklist = manager
ResourceBlacklistRequest blacklist = manager
.getBlacklistUpdates();
List<String> blacklistAdditions = blacklist.getAdditions();
List<String> blacklistAdditions = blacklist.getBlacklistAdditions();
Collections.sort(blacklistAdditions);
List<String> blacklistRemovals = blacklist.getRemovals();
List<String> blacklistRemovals = blacklist.getBlacklistRemovals();
String[] expectedBlacklistAdditions = new String[]{anyNode2, anyNode};
Assert.assertArrayEquals(
"Blacklist additions was not as expected",
@ -61,12 +62,12 @@ public class TestBlacklistManager {
String anyNode = "foo";
String anyNode2 = "bar";
manager.addNode(anyNode);
BlacklistUpdates blacklist = manager
ResourceBlacklistRequest blacklist = manager
.getBlacklistUpdates();
List<String> blacklistAdditions = blacklist.getAdditions();
List<String> blacklistAdditions = blacklist.getBlacklistAdditions();
Collections.sort(blacklistAdditions);
List<String> blacklistRemovals = blacklist.getRemovals();
List<String> blacklistRemovals = blacklist.getBlacklistRemovals();
String[] expectedBlacklistAdditions = new String[]{anyNode};
Assert.assertArrayEquals(
"Blacklist additions was not as expected",
@ -81,9 +82,9 @@ public class TestBlacklistManager {
blacklist = manager
.getBlacklistUpdates();
blacklistAdditions = blacklist.getAdditions();
blacklistAdditions = blacklist.getBlacklistAdditions();
Collections.sort(blacklistAdditions);
blacklistRemovals = blacklist.getRemovals();
blacklistRemovals = blacklist.getBlacklistRemovals();
Collections.sort(blacklistRemovals);
String[] expectedBlacklistRemovals = new String[] {anyNode2, anyNode};
Assert.assertTrue(
@ -101,11 +102,11 @@ public class TestBlacklistManager {
BlacklistManager disabled = new DisabledBlacklistManager();
String anyNode = "foo";
disabled.addNode(anyNode);
BlacklistUpdates blacklist = disabled
ResourceBlacklistRequest blacklist = disabled
.getBlacklistUpdates();
List<String> blacklistAdditions = blacklist.getAdditions();
List<String> blacklistRemovals = blacklist.getRemovals();
List<String> blacklistAdditions = blacklist.getBlacklistAdditions();
List<String> blacklistRemovals = blacklist.getBlacklistRemovals();
Assert.assertTrue(
"Blacklist additions should be empty but was " +
blacklistAdditions,

View File

@ -42,7 +42,6 @@ import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
import org.apache.hadoop.yarn.MockApps;
import org.apache.hadoop.yarn.api.records.AMBlackListingRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
@ -1029,63 +1028,6 @@ public class TestRMAppTransitions {
+ "/"));
}
@Test
public void testAMBlackListConfigFromApp() {
// Scenario 1: Application enables AM blacklisting
float disableThreshold = 0.9f;
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, false);
ApplicationSubmissionContext submissionContext =
new ApplicationSubmissionContextPBImpl();
submissionContext.setAMBlackListRequest(AMBlackListingRequest.newInstance(
true, disableThreshold));
RMAppImpl application = (RMAppImpl) createNewTestApp(submissionContext);
Assert.assertTrue(application.isAmBlacklistingEnabled());
Assert.assertEquals(disableThreshold,
application.getAmBlacklistingDisableThreshold(), 1e-8);
// Scenario 2: Application disables AM blacklisting
float globalThreshold = 0.9f;
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD,
globalThreshold);
ApplicationSubmissionContext submissionContext2 =
new ApplicationSubmissionContextPBImpl();
submissionContext2.setAMBlackListRequest(AMBlackListingRequest.newInstance(
false, disableThreshold));
RMAppImpl application2 = (RMAppImpl) createNewTestApp(submissionContext2);
// Am blacklisting will be disabled eventhough its enabled in RM.
Assert.assertFalse(application2.isAmBlacklistingEnabled());
// Scenario 3: Application updates invalid AM threshold
float invalidDisableThreshold = -0.5f;
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD,
globalThreshold);
ApplicationSubmissionContext submissionContext3 =
new ApplicationSubmissionContextPBImpl();
submissionContext3.setAMBlackListRequest(AMBlackListingRequest.newInstance(
true, invalidDisableThreshold));
RMAppImpl application3 = (RMAppImpl) createNewTestApp(submissionContext3);
Assert.assertTrue(application3.isAmBlacklistingEnabled());
Assert.assertEquals(globalThreshold,
application3.getAmBlacklistingDisableThreshold(), 1e-8);
// Scenario 4: Empty AMBlackListingRequest in Submission Context
conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true);
conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD,
globalThreshold);
ApplicationSubmissionContext submissionContext4 =
new ApplicationSubmissionContextPBImpl();
RMAppImpl application4 = (RMAppImpl) createNewTestApp(submissionContext4);
Assert.assertTrue(application4.isAmBlacklistingEnabled());
Assert.assertEquals(globalThreshold,
application4.getAmBlacklistingDisableThreshold(), 1e-8);
}
private void verifyApplicationFinished(RMAppState state) {
ArgumentCaptor<RMAppState> finalState =
ArgumentCaptor.forClass(RMAppState.class);

View File

@ -41,32 +41,32 @@ public class TestAppSchedulingInfo {
AppSchedulingInfo appSchedulingInfo = new AppSchedulingInfo(
appAttemptId, "test", queue, null, 0, new ResourceUsage());
appSchedulingInfo.updateBlacklist(new ArrayList<String>(),
appSchedulingInfo.updatePlacesBlacklistedByApp(new ArrayList<String>(),
new ArrayList<String>());
Assert.assertFalse(appSchedulingInfo.getAndResetBlacklistChanged());
ArrayList<String> blacklistAdditions = new ArrayList<String>();
blacklistAdditions.add("node1");
blacklistAdditions.add("node2");
appSchedulingInfo.updateBlacklist(blacklistAdditions,
appSchedulingInfo.updatePlacesBlacklistedByApp(blacklistAdditions,
new ArrayList<String>());
Assert.assertTrue(appSchedulingInfo.getAndResetBlacklistChanged());
blacklistAdditions.clear();
blacklistAdditions.add("node1");
appSchedulingInfo.updateBlacklist(blacklistAdditions,
appSchedulingInfo.updatePlacesBlacklistedByApp(blacklistAdditions,
new ArrayList<String>());
Assert.assertFalse(appSchedulingInfo.getAndResetBlacklistChanged());
ArrayList<String> blacklistRemovals = new ArrayList<String>();
blacklistRemovals.add("node1");
appSchedulingInfo.updateBlacklist(new ArrayList<String>(),
appSchedulingInfo.updatePlacesBlacklistedByApp(new ArrayList<String>(),
blacklistRemovals);
appSchedulingInfo.updateBlacklist(new ArrayList<String>(),
appSchedulingInfo.updatePlacesBlacklistedByApp(new ArrayList<String>(),
blacklistRemovals);
Assert.assertTrue(appSchedulingInfo.getAndResetBlacklistChanged());
appSchedulingInfo.updateBlacklist(new ArrayList<String>(),
appSchedulingInfo.updatePlacesBlacklistedByApp(new ArrayList<String>(),
blacklistRemovals);
Assert.assertFalse(appSchedulingInfo.getAndResetBlacklistChanged());
}

View File

@ -684,14 +684,16 @@ public class TestCapacityScheduler {
cs.allocate(appAttemptId, Collections.<ResourceRequest>emptyList(),
Collections.<ContainerId>emptyList(),
Collections.singletonList(host), null, null, null);
Assert.assertTrue(cs.getApplicationAttempt(appAttemptId).isBlacklisted(host));
Assert.assertTrue(cs.getApplicationAttempt(appAttemptId)
.isPlaceBlacklisted(host));
cs.allocate(appAttemptId, Collections.<ResourceRequest>emptyList(),
Collections.<ContainerId>emptyList(), null,
Collections.singletonList(host), null, null);
Assert.assertFalse(cs.getApplicationAttempt(appAttemptId).isBlacklisted(host));
Assert.assertFalse(cs.getApplicationAttempt(appAttemptId)
.isPlaceBlacklisted(host));
rm.stop();
}
@Test
public void testAllocateReorder() throws Exception {

View File

@ -314,24 +314,24 @@ public class TestFSAppAttempt extends FairSchedulerTestBase {
FSAppAttempt spyApp = spy(app);
doReturn(false)
.when(spyApp).isWaitingForAMContainer();
assertTrue(spyApp.isBlacklisted(n1.getNodeName()));
assertFalse(spyApp.isBlacklisted(n2.getNodeName()));
assertTrue(spyApp.isPlaceBlacklisted(n1.getNodeName()));
assertFalse(spyApp.isPlaceBlacklisted(n2.getNodeName()));
assertEquals(n2.getUnallocatedResource(), spyApp.getHeadroom());
blacklistAdditions.clear();
blacklistAdditions.add(n2.getNodeName());
blacklistRemovals.add(n1.getNodeName());
app.updateBlacklist(blacklistAdditions, blacklistRemovals);
assertFalse(spyApp.isBlacklisted(n1.getNodeName()));
assertTrue(spyApp.isBlacklisted(n2.getNodeName()));
assertFalse(spyApp.isPlaceBlacklisted(n1.getNodeName()));
assertTrue(spyApp.isPlaceBlacklisted(n2.getNodeName()));
assertEquals(n1.getUnallocatedResource(), spyApp.getHeadroom());
blacklistAdditions.clear();
blacklistRemovals.clear();
blacklistRemovals.add(n2.getNodeName());
app.updateBlacklist(blacklistAdditions, blacklistRemovals);
assertFalse(spyApp.isBlacklisted(n1.getNodeName()));
assertFalse(spyApp.isBlacklisted(n2.getNodeName()));
assertFalse(spyApp.isPlaceBlacklisted(n1.getNodeName()));
assertFalse(spyApp.isPlaceBlacklisted(n2.getNodeName()));
assertEquals(clusterResource, spyApp.getHeadroom());
}

View File

@ -3995,11 +3995,12 @@ public class TestFairScheduler extends FairSchedulerTestBase {
scheduler.allocate(appAttemptId, Collections.<ResourceRequest>emptyList(),
Collections.<ContainerId>emptyList(),
Collections.singletonList(host), null, null, null);
assertTrue(app.isBlacklisted(host));
assertTrue(app.isPlaceBlacklisted(host));
scheduler.allocate(appAttemptId, Collections.<ResourceRequest>emptyList(),
Collections.<ContainerId>emptyList(), null,
Collections.singletonList(host), null, null);
assertFalse(scheduler.getSchedulerApp(appAttemptId).isBlacklisted(host));
assertFalse(scheduler.getSchedulerApp(appAttemptId)
.isPlaceBlacklisted(host));
List<ResourceRequest> update = Arrays.asList(
createResourceRequest(GB, node.getHostName(), 1, 0, true));
@ -4008,7 +4009,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
scheduler.allocate(appAttemptId, update,
Collections.<ContainerId>emptyList(),
Collections.singletonList(host), null, null, null);
assertTrue(app.isBlacklisted(host));
assertTrue(app.isPlaceBlacklisted(host));
scheduler.update();
scheduler.handle(updateEvent);
assertEquals("Incorrect number of containers allocated", 0, app
@ -4018,7 +4019,7 @@ public class TestFairScheduler extends FairSchedulerTestBase {
scheduler.allocate(appAttemptId, update,
Collections.<ContainerId>emptyList(), null,
Collections.singletonList(host), null, null);
assertFalse(app.isBlacklisted(host));
assertFalse(app.isPlaceBlacklisted(host));
createSchedulingRequest(GB, "root.default", "user", 1);
scheduler.update();
scheduler.handle(updateEvent);

View File

@ -22,7 +22,14 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;
import java.io.*;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
@ -43,15 +50,12 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import com.sun.jersey.api.client.config.DefaultClientConfig;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.authentication.server.AuthenticationFilter;
import org.apache.hadoop.security.authentication.server.PseudoAuthenticationHandler;
import org.apache.hadoop.yarn.api.records.AMBlackListingRequest;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@ -72,7 +76,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.Capacity
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairSchedulerConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.*;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppPriority;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppQueue;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.AppState;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.ApplicationSubmissionContextInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.CredentialsInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LocalResourceInfo;
import org.apache.hadoop.yarn.server.resourcemanager.webapp.dao.LogAggregationContextInfo;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.webapp.GenericExceptionHandler;
import org.apache.hadoop.yarn.webapp.JerseyTestBase;
@ -101,6 +111,7 @@ import com.sun.jersey.api.client.Client;
import com.sun.jersey.api.client.ClientResponse;
import com.sun.jersey.api.client.ClientResponse.Status;
import com.sun.jersey.api.client.WebResource;
import com.sun.jersey.api.client.config.DefaultClientConfig;
import com.sun.jersey.api.client.filter.LoggingFilter;
import com.sun.jersey.api.json.JSONJAXBContext;
import com.sun.jersey.api.json.JSONMarshaller;
@ -827,17 +838,6 @@ public class TestRMWebServicesAppsModification extends JerseyTestBase {
System.currentTimeMillis(), 1).toString();
appInfo.setReservationId(reservationId);
// Set AMBlackListingRequestInfo
boolean isAMBlackListingEnabled = true;
float disableFailureThreshold = 0.01f;
AMBlackListingRequestInfo amBlackListingRequestInfo
= new AMBlackListingRequestInfo();
amBlackListingRequestInfo.setAMBlackListingEnabled(
isAMBlackListingEnabled);
amBlackListingRequestInfo.setBlackListingDisableFailureThreshold(
disableFailureThreshold);
appInfo.setAMBlackListingRequestInfo(amBlackListingRequestInfo);
ClientResponse response =
this.constructWebResource(urlPath).accept(acceptMedia)
.entity(appInfo, contentMedia).post(ClientResponse.class);
@ -912,13 +912,6 @@ public class TestRMWebServicesAppsModification extends JerseyTestBase {
// Check ReservationId
assertEquals(reservationId, app.getReservationId().toString());
// Check AMBlackListingRequestInfo
AMBlackListingRequest amBlackListingRequest = asc.getAMBlackListRequest();
assertEquals(isAMBlackListingEnabled,
amBlackListingRequest.isAMBlackListingEnabled());
assertTrue(disableFailureThreshold == amBlackListingRequest
.getBlackListingDisableFailureThreshold());
response =
this.constructWebResource("apps", appId).accept(acceptMedia)
.get(ClientResponse.class);