diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index d67ef2a6e2d..bbb7f07f384 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -378,6 +378,8 @@ Release 2.8.0 - UNRELEASED YARN-4145. Make RMHATestBase abstract so its not run when running all tests under that namespace (adhoot via rkanter) + YARN-2005. Blacklisting support for scheduling AMs. (Anubhav Dhoot via kasha) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 182be8e6a71..cfe440d6924 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -2025,6 +2025,15 @@ public class YarnConfiguration extends Configuration { public static final String NM_PROVIDER_CONFIGURED_NODE_LABELS = NM_NODE_LABELS_PROVIDER_PREFIX + "configured-node-labels"; + public static final String AM_BLACKLISTING_ENABLED = + YARN_PREFIX + "am.blacklisting.enabled"; + public static final boolean DEFAULT_AM_BLACKLISTING_ENABLED = true; + + public static final String AM_BLACKLISTING_DISABLE_THRESHOLD = + YARN_PREFIX + "am.blacklisting.disable-failure-threshold"; + public static final float DEFAULT_AM_BLACKLISTING_DISABLE_THRESHOLD = 0.8f; + + public YarnConfiguration() { super(); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index b76defb712d..bcd64c3e124 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -2293,4 +2293,22 @@ org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor + + + Enable/disable blacklisting of hosts for AM based on AM failures on those + hosts. + + yarn.am.blacklisting.enabled + true + + + + + Threshold ratio of NodeManager hosts that are allowed to be + blacklisted for AM. Beyond this ratio there is no blacklisting to avoid + danger of blacklisting the entire cluster. + + yarn.am.blacklisting.disable-failure-threshold + 0.8f + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/BlacklistManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/BlacklistManager.java new file mode 100644 index 00000000000..f03b4217e24 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/BlacklistManager.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.blacklist; + +import org.apache.hadoop.classification.InterfaceAudience.Private; + +/** + * Tracks blacklists based on failures reported on nodes. + */ +@Private +public interface BlacklistManager { + + /** + * Report failure of a container on node. + * @param node that has a container failure + */ + void addNode(String node); + + /** + * Get {@link BlacklistUpdates} that indicate which nodes should be + * added to or removed from the blacklist. + * @return {@link BlacklistUpdates} + */ + BlacklistUpdates getBlacklistUpdates(); + + /** + * Refresh the number of nodemanager hosts available for scheduling. + * @param nodeHostCount is the number of node hosts. + */ + void refreshNodeHostCount(int nodeHostCount); +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/BlacklistUpdates.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/BlacklistUpdates.java new file mode 100644 index 00000000000..c76dfb4d538 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/BlacklistUpdates.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.blacklist; + +import org.apache.hadoop.classification.InterfaceAudience.Private; + +import java.util.List; + +/** + * Class to track blacklist additions and removals. + */ +@Private +public class BlacklistUpdates { + + private List additions; + private List removals; + + public BlacklistUpdates(List additions, + List removals) { + this.additions = additions; + this.removals = removals; + } + + public List getAdditions() { + return additions; + } + + public List getRemovals() { + return removals; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/DisabledBlacklistManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/DisabledBlacklistManager.java new file mode 100644 index 00000000000..f155b45aa50 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/DisabledBlacklistManager.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.blacklist; + +import java.util.ArrayList; + +/** + * A {@link BlacklistManager} that returns no blacklists. + */ +public class DisabledBlacklistManager implements BlacklistManager{ + + private static final ArrayList EMPTY_LIST = new ArrayList(); + private BlacklistUpdates noBlacklist = + new BlacklistUpdates(EMPTY_LIST, EMPTY_LIST); + + @Override + public void addNode(String node) { + } + + @Override + public BlacklistUpdates getBlacklistUpdates() { + return noBlacklist; + } + + @Override + public void refreshNodeHostCount(int nodeHostCount) { + // Do nothing + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/SimpleBlacklistManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/SimpleBlacklistManager.java new file mode 100644 index 00000000000..a544ab88e89 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/SimpleBlacklistManager.java @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.resourcemanager.blacklist; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Maintains a list of failed nodes and returns that as long as number of + * blacklisted nodes is below a threshold percentage of total nodes. If more + * than threshold number of nodes are marked as failure they all are returned + * as removal from blacklist so previous additions are reversed. 
+ */ +public class SimpleBlacklistManager implements BlacklistManager { + + private int numberOfNodeManagerHosts; + private final double blacklistDisableFailureThreshold; + private final Set blacklistNodes = new HashSet<>(); + private static final ArrayList EMPTY_LIST = new ArrayList<>(); + + private static final Log LOG = LogFactory.getLog(SimpleBlacklistManager.class); + + public SimpleBlacklistManager(int numberOfNodeManagerHosts, + double blacklistDisableFailureThreshold) { + this.numberOfNodeManagerHosts = numberOfNodeManagerHosts; + this.blacklistDisableFailureThreshold = blacklistDisableFailureThreshold; + } + + @Override + public void addNode(String node) { + blacklistNodes.add(node); + } + + @Override + public void refreshNodeHostCount(int nodeHostCount) { + this.numberOfNodeManagerHosts = nodeHostCount; + } + + @Override + public BlacklistUpdates getBlacklistUpdates() { + BlacklistUpdates ret; + List blacklist = new ArrayList<>(blacklistNodes); + final int currentBlacklistSize = blacklist.size(); + final double failureThreshold = this.blacklistDisableFailureThreshold * + numberOfNodeManagerHosts; + if (currentBlacklistSize < failureThreshold) { + if (LOG.isDebugEnabled()) { + LOG.debug("blacklist size " + currentBlacklistSize + " is less than " + + "failure threshold ratio " + blacklistDisableFailureThreshold + + " out of total usable nodes " + numberOfNodeManagerHosts); + } + ret = new BlacklistUpdates(blacklist, EMPTY_LIST); + } else { + if (LOG.isDebugEnabled()) { + LOG.debug("blacklist size " + currentBlacklistSize + " is more than " + + "failure threshold ratio " + blacklistDisableFailureThreshold + + " out of total usable nodes " + numberOfNodeManagerHosts); + } + ret = new BlacklistUpdates(EMPTY_LIST, blacklist); + } + return ret; + } +} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index 2eb74f7829a..7cf39b88855 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -74,6 +74,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEvent; import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEventType; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistManager; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.DisabledBlacklistManager; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.SimpleBlacklistManager; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; import org.apache.hadoop.yarn.server.resourcemanager.recovery.Recoverable; import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData; @@ -133,6 +136,8 @@ public class RMAppImpl implements RMApp, Recoverable { private final Set applicationTags; private final long attemptFailuresValidityInterval; + private final boolean amBlacklistingEnabled; + private final float blacklistDisableThreshold; private Clock systemClock; @@ -456,6 +461,18 @@ public class RMAppImpl implements RMApp, Recoverable { maxLogAggregationDiagnosticsInMemory = conf.getInt( YarnConfiguration.RM_MAX_LOG_AGGREGATION_DIAGNOSTICS_IN_MEMORY, YarnConfiguration.DEFAULT_RM_MAX_LOG_AGGREGATION_DIAGNOSTICS_IN_MEMORY); + + amBlacklistingEnabled = conf.getBoolean( + 
YarnConfiguration.AM_BLACKLISTING_ENABLED, + YarnConfiguration.DEFAULT_AM_BLACKLISTING_ENABLED); + + if (amBlacklistingEnabled) { + blacklistDisableThreshold = conf.getFloat( + YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD, + YarnConfiguration.DEFAULT_AM_BLACKLISTING_DISABLE_THRESHOLD); + } else { + blacklistDisableThreshold = 0.0f; + } } @Override @@ -797,6 +814,18 @@ public class RMAppImpl implements RMApp, Recoverable { private void createNewAttempt() { ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(applicationId, attempts.size() + 1); + + BlacklistManager currentAMBlacklist; + if (currentAttempt != null) { + currentAMBlacklist = currentAttempt.getAMBlacklist(); + } else { + if (amBlacklistingEnabled) { + currentAMBlacklist = new SimpleBlacklistManager( + scheduler.getNumClusterNodes(), blacklistDisableThreshold); + } else { + currentAMBlacklist = new DisabledBlacklistManager(); + } + } RMAppAttempt attempt = new RMAppAttemptImpl(appAttemptId, rmContext, scheduler, masterService, submissionContext, conf, @@ -804,7 +833,8 @@ public class RMAppImpl implements RMApp, Recoverable { // previously failed attempts(which should not include Preempted, // hardware error and NM resync) + 1) equal to the max-attempt // limit. 
- maxAppAttempts == (getNumFailedAppAttempts() + 1), amReq); + maxAppAttempts == (getNumFailedAppAttempts() + 1), amReq, + currentAMBlacklist); attempts.put(appAttemptId, attempt); currentAttempt = attempt; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java index b85174efcf6..4dd834580b0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttempt.java @@ -38,6 +38,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; import org.apache.hadoop.yarn.security.client.ClientToAMTokenIdentifier; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistManager; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; /** @@ -184,6 +185,12 @@ public interface RMAppAttempt extends EventHandler { */ ApplicationResourceUsageReport getApplicationResourceUsageReport(); + /** + * Get the {@link BlacklistManager} that manages blacklists for AM failures + * @return the {@link BlacklistManager} that tracks AM failures. + */ + BlacklistManager getAMBlacklist(); + /** * the start time of the application. * @return the start time of the application. 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 74a4000f08d..629b2a3f9e6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -36,7 +36,6 @@ import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import javax.crypto.SecretKey; -import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -71,6 +70,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEvent; import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncherEventType; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistManager; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.BlacklistUpdates; +import org.apache.hadoop.yarn.server.resourcemanager.blacklist.DisabledBlacklistManager; import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData; import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore; @@ -182,6 +184,7 @@ public class RMAppAttemptImpl implements 
RMAppAttempt, Recoverable { private RMAppAttemptMetrics attemptMetrics = null; private ResourceRequest amReq = null; + private BlacklistManager blacklistedNodesForAM = null; private static final StateMachineFactory()); appAttempt.finishedContainersSentToAM.get(nodeId).add( @@ -1708,6 +1747,15 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { } } + private void addAMNodeToBlackList(NodeId nodeId) { + blacklistedNodesForAM.addNode(nodeId.getHost().toString()); + } + + @Override + public BlacklistManager getAMBlacklist() { + return blacklistedNodesForAM; + } + private static void addJustFinishedContainer(RMAppAttemptImpl appAttempt, RMAppAttemptContainerFinishedEvent containerFinishedEvent) { appAttempt.justFinishedContainers.putIfAbsent(containerFinishedEvent diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java index 77ac5b3e640..e318d473df1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java @@ -65,7 +65,8 @@ public class AppSchedulingInfo { new org.apache.hadoop.yarn.server.resourcemanager.resource.Priority.Comparator()); final Map> requests = new ConcurrentHashMap>(); - private Set blacklist = new HashSet(); + private Set userBlacklist = new HashSet<>(); + private Set amBlacklist = new HashSet<>(); //private final ApplicationStore store; private ActiveUsersManager activeUsersManager; @@ -217,21 +218,39 @@ 
public class AppSchedulingInfo { } /** - * The ApplicationMaster is updating the blacklist + * The ApplicationMaster is updating the userBlacklist used for containers + * other than AMs. * - * @param blacklistAdditions resources to be added to the blacklist - * @param blacklistRemovals resources to be removed from the blacklist + * @param blacklistAdditions resources to be added to the userBlacklist + * @param blacklistRemovals resources to be removed from the userBlacklist */ - synchronized public void updateBlacklist( + public void updateBlacklist( List blacklistAdditions, List blacklistRemovals) { - // Add to blacklist - if (blacklistAdditions != null) { - blacklist.addAll(blacklistAdditions); - } + updateUserOrAMBlacklist(userBlacklist, blacklistAdditions, + blacklistRemovals); + } - // Remove from blacklist - if (blacklistRemovals != null) { - blacklist.removeAll(blacklistRemovals); + /** + * RM is updating blacklist for AM containers. + * @param blacklistAdditions resources to be added to the amBlacklist + * @param blacklistRemovals resources to be removed from the amBlacklist + */ + public void updateAMBlacklist( + List blacklistAdditions, List blacklistRemovals) { + updateUserOrAMBlacklist(amBlacklist, blacklistAdditions, + blacklistRemovals); + } + + void updateUserOrAMBlacklist(Set blacklist, + List blacklistAdditions, List blacklistRemovals) { + synchronized (blacklist) { + if (blacklistAdditions != null) { + blacklist.addAll(blacklistAdditions); + } + + if (blacklistRemovals != null) { + blacklist.removeAll(blacklistRemovals); + } } } @@ -263,8 +282,23 @@ public class AppSchedulingInfo { return (request == null) ? 
null : request.getCapability(); } - public synchronized boolean isBlacklisted(String resourceName) { - return blacklist.contains(resourceName); + /** + * Returns whether the resource is in the AM blacklist or the user blacklist. + * @param resourceName the resource name + * @param useAMBlacklist true to check amBlacklist, false to check userBlacklist + * @return true if it is blacklisted + */ + public boolean isBlacklisted(String resourceName, + boolean useAMBlacklist) { + if (useAMBlacklist){ + synchronized (amBlacklist) { + return amBlacklist.contains(resourceName); + } + } else { + synchronized (userBlacklist) { + return userBlacklist.contains(resourceName); + } + } } /** @@ -473,19 +507,25 @@ public class AppSchedulingInfo { this.queue = queue; } - public synchronized Set getBlackList() { - return this.blacklist; + public Set getBlackList() { + return this.userBlacklist; } - public synchronized Set getBlackListCopy() { - return new HashSet<>(this.blacklist); + public Set getBlackListCopy() { + synchronized (userBlacklist) { + return new HashSet<>(this.userBlacklist); + } } public synchronized void transferStateFromPreviousAppSchedulingInfo( AppSchedulingInfo appInfo) { // this.priorities = appInfo.getPriorities(); // this.requests = appInfo.getRequests(); - this.blacklist = appInfo.getBlackList(); + // This should not require locking the userBlacklist since it will not be + // used by this instance until after setCurrentAppAttempt. + // Should cleanup this to avoid sharing between instances and can + // then remove getBlackList as well. 
+ this.userBlacklist = appInfo.getBlackList(); } public synchronized void recoverContainer(RMContainer rmContainer) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java index 48725435c09..b361d15362e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java @@ -470,16 +470,9 @@ public class SchedulerApplicationAttempt implements SchedulableEntity { RMContainer rmContainer = i.next(); Container container = rmContainer.getContainer(); ContainerType containerType = ContainerType.TASK; - // The working knowledge is that masterContainer for AM is null as it - // itself is the master container. 
- RMAppAttempt appAttempt = - rmContext - .getRMApps() - .get( - container.getId().getApplicationAttemptId() - .getApplicationId()).getCurrentAppAttempt(); - if (appAttempt.getMasterContainer() == null - && appAttempt.getSubmissionContext().getUnmanagedAM() == false) { + boolean isWaitingForAMContainer = isWaitingForAMContainer( + container.getId().getApplicationAttemptId().getApplicationId()); + if (isWaitingForAMContainer) { containerType = ContainerType.APPLICATION_MASTER; } try { @@ -509,6 +502,16 @@ public class SchedulerApplicationAttempt implements SchedulableEntity { return new ContainersAndNMTokensAllocation(returnContainerList, nmTokens); } + public boolean isWaitingForAMContainer(ApplicationId applicationId) { + // The working knowledge is that masterContainer for AM is null as it + // itself is the master container. + RMAppAttempt appAttempt = + rmContext.getRMApps().get(applicationId).getCurrentAppAttempt(); + return (appAttempt != null && appAttempt.getMasterContainer() == null + && appAttempt.getSubmissionContext().getUnmanagedAM() == false); + } + + // Blacklist used for user containers public synchronized void updateBlacklist( List blacklistAdditions, List blacklistRemovals) { if (!isStopped) { @@ -516,9 +519,19 @@ public class SchedulerApplicationAttempt implements SchedulableEntity { blacklistAdditions, blacklistRemovals); } } - + + // Blacklist used for AM containers + public synchronized void updateAMBlacklist( + List blacklistAdditions, List blacklistRemovals) { + if (!isStopped) { + this.appSchedulingInfo.updateAMBlacklist( + blacklistAdditions, blacklistRemovals); + } + } + public boolean isBlacklisted(String resourceName) { - return this.appSchedulingInfo.isBlacklisted(resourceName); + boolean useAMBlacklist = isWaitingForAMContainer(getApplicationId()); + return this.appSchedulingInfo.isBlacklisted(resourceName, useAMBlacklist); } public synchronized int addMissedNonPartitionedRequestSchedulingOpportunity( diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index a7e9d8cb1e0..dbaccaf3d07 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -52,7 +52,6 @@ import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; -import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; @@ -934,7 +933,13 @@ public class CapacityScheduler extends } } - application.updateBlacklist(blacklistAdditions, blacklistRemovals); + if (application.isWaitingForAMContainer(application.getApplicationId())) { + // Allocate is for AM and update AM blacklist for this + application.updateAMBlacklist( + blacklistAdditions, blacklistRemovals); + } else { + application.updateBlacklist(blacklistAdditions, blacklistRemovals); + } allocation = application.getAllocation(getResourceCalculator(), clusterResource, getMinimumResourceCapability()); diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerUtils.java deleted file mode 100644 index 9bece9ba50e..00000000000 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerUtils.java +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica; - -import org.apache.commons.logging.Log; - -public class FiCaSchedulerUtils { - - public static boolean isBlacklisted(FiCaSchedulerApp application, - FiCaSchedulerNode node, Log LOG) { - if (application.isBlacklisted(node.getNodeName())) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping 'host' " + node.getNodeName() + - " for " + application.getApplicationId() + - " since it has been blacklisted"); - } - return true; - } - - if (application.isBlacklisted(node.getRackName())) { - if (LOG.isDebugEnabled()) { - LOG.debug("Skipping 'rack' " + node.getRackName() + - " for " + application.getApplicationId() + - " since it has been blacklisted"); - } - return true; - } - - return false; - } - -} diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index 3eefb8f7286..5243fb3a144 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -955,7 +955,14 @@ public class FairScheduler extends preemptionContainerIds.add(container.getContainerId()); } - application.updateBlacklist(blacklistAdditions, blacklistRemovals); + if (application.isWaitingForAMContainer(application.getApplicationId())) { + // Allocate is for AM and update AM blacklist for this + application.updateAMBlacklist( + blacklistAdditions, blacklistRemovals); + } else { + 
application.updateBlacklist(blacklistAdditions, blacklistRemovals); + } + ContainersAndNMTokensAllocation allocation = application.pullNewlyAllocatedContainersAndNMTokens(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java index 6b77ceb8dc9..99760df671e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fifo/FifoScheduler.java @@ -352,11 +352,18 @@ public class FifoScheduler extends application.showRequests(); LOG.debug("allocate:" + - " applicationId=" + applicationAttemptId + + " applicationId=" + applicationAttemptId + " #ask=" + ask.size()); } - application.updateBlacklist(blacklistAdditions, blacklistRemovals); + if (application.isWaitingForAMContainer(application.getApplicationId())) { + // Allocate is for AM and update AM blacklist for this + application.updateAMBlacklist( + blacklistAdditions, blacklistRemovals); + } else { + application.updateBlacklist(blacklistAdditions, blacklistRemovals); + } + ContainersAndNMTokensAllocation allocation = application.pullNewlyAllocatedContainersAndNMTokens(); Resource headroom = application.getHeadroom(); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java index 50803550af5..e464401387c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/MockRM.java @@ -750,10 +750,7 @@ public class MockRM extends ResourceManager { public static MockAM launchAM(RMApp app, MockRM rm, MockNM nm) throws Exception { - rm.waitForState(app.getApplicationId(), RMAppState.ACCEPTED); - RMAppAttempt attempt = app.getCurrentAppAttempt(); - waitForSchedulerAppAttemptAdded(attempt.getAppAttemptId(), rm); - rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.SCHEDULED); + RMAppAttempt attempt = waitForAttemptScheduled(app, rm); System.out.println("Launch AM " + attempt.getAppAttemptId()); nm.nodeHeartbeat(true); MockAM am = rm.sendAMLaunched(attempt.getAppAttemptId()); @@ -761,6 +758,15 @@ public class MockRM extends ResourceManager { return am; } + public static RMAppAttempt waitForAttemptScheduled(RMApp app, MockRM rm) + throws Exception { + rm.waitForState(app.getApplicationId(), RMAppState.ACCEPTED); + RMAppAttempt attempt = app.getCurrentAppAttempt(); + waitForSchedulerAppAttemptAdded(attempt.getAppAttemptId(), rm); + rm.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.SCHEDULED); + return attempt; + } + public static MockAM launchAndRegisterAM(RMApp app, MockRM rm, MockNM nm) throws Exception { MockAM am = launchAM(app, rm, nm); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java index d579595113d..dc843b9ee98 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/applicationsmanager/TestAMRestart.java @@ -35,8 +35,12 @@ import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NMToken; +import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.event.DrainDispatcher; +import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.resourcemanager.MockAM; import org.apache.hadoop.yarn.server.resourcemanager.MockNM; @@ -49,11 +53,14 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler; import 
org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; +import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.util.ControlledClock; import org.apache.hadoop.yarn.util.Records; import org.apache.hadoop.yarn.util.SystemClock; @@ -82,21 +89,7 @@ public class TestAMRestart { MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); int NUM_CONTAINERS = 3; - // allocate NUM_CONTAINERS containers - am1.allocate("127.0.0.1", 1024, NUM_CONTAINERS, - new ArrayList()); - nm1.nodeHeartbeat(true); - - // wait for containers to be allocated. - List containers = - am1.allocate(new ArrayList(), - new ArrayList()).getAllocatedContainers(); - while (containers.size() != NUM_CONTAINERS) { - nm1.nodeHeartbeat(true); - containers.addAll(am1.allocate(new ArrayList(), - new ArrayList()).getAllocatedContainers()); - Thread.sleep(200); - } + allocateContainers(nm1, am1, NUM_CONTAINERS); // launch the 2nd container, for testing running container transferred. nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING); @@ -244,6 +237,29 @@ public class TestAMRestart { rm1.stop(); } + private List allocateContainers(MockNM nm1, MockAM am1, + int NUM_CONTAINERS) throws Exception { + // allocate NUM_CONTAINERS containers + am1.allocate("127.0.0.1", 1024, NUM_CONTAINERS, + new ArrayList()); + nm1.nodeHeartbeat(true); + + // wait for containers to be allocated. 
+ List containers = + am1.allocate(new ArrayList(), + new ArrayList()).getAllocatedContainers(); + while (containers.size() != NUM_CONTAINERS) { + nm1.nodeHeartbeat(true); + containers.addAll(am1.allocate(new ArrayList(), + new ArrayList()).getAllocatedContainers()); + Thread.sleep(200); + } + + Assert.assertEquals("Did not get all containers allocated", + NUM_CONTAINERS, containers.size()); + return containers; + } + private void waitForContainersToFinish(int expectedNum, RMAppAttempt attempt) throws InterruptedException { int count = 0; @@ -258,6 +274,9 @@ public class TestAMRestart { public void testNMTokensRebindOnAMRestart() throws Exception { YarnConfiguration conf = new YarnConfiguration(); conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 3); + // To prevent test from blacklisting nm1 for AM, we set the threshold to half + // of 2 nodes which is 1 + conf.setFloat(YarnConfiguration.AM_BLACKLISTING_DISABLE_THRESHOLD, 0.5f); MockRM rm1 = new MockRM(conf); rm1.start(); @@ -355,6 +374,106 @@ public class TestAMRestart { rm1.stop(); } + @Test(timeout = 100000) + public void testAMBlacklistPreventsRestartOnSameNode() throws Exception { + YarnConfiguration conf = new YarnConfiguration(); + conf.setBoolean(YarnConfiguration.AM_BLACKLISTING_ENABLED, true); + MemoryRMStateStore memStore = new MemoryRMStateStore(); + memStore.init(conf); + final DrainDispatcher dispatcher = new DrainDispatcher(); + MockRM rm1 = new MockRM(conf, memStore) { + @Override + protected EventHandler createSchedulerEventDispatcher() { + return new SchedulerEventDispatcher(this.scheduler) { + @Override + public void handle(SchedulerEvent event) { + scheduler.handle(event); + } + }; + } + + @Override + protected Dispatcher createDispatcher() { + return dispatcher; + } + }; + + rm1.start(); + + MockNM nm1 = + new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService()); + nm1.registerNode(); + + MockNM nm2 = + new MockNM("127.0.0.2:2345", 8000, rm1.getResourceTrackerService()); +
nm2.registerNode(); + + RMApp app1 = rm1.submitApp(200); + + MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1); + CapacityScheduler scheduler = + (CapacityScheduler) rm1.getResourceScheduler(); + ContainerId amContainer = + ContainerId.newContainerId(am1.getApplicationAttemptId(), 1); + // Look up the node on which the first attempt's AM container was allocated + RMContainer rmContainer = scheduler.getRMContainer(amContainer); + NodeId nodeWhereAMRan = rmContainer.getAllocatedNode(); + + MockNM currentNode, otherNode; + if (nodeWhereAMRan.equals(nm1.getNodeId())) { + currentNode = nm1; + otherNode = nm2; + } else { + currentNode = nm2; + otherNode = nm1; + } + + ContainerStatus containerStatus = + BuilderUtils.newContainerStatus(amContainer, ContainerState.COMPLETE, + "", ContainerExitStatus.DISKS_FAILED); + currentNode.containerStatus(containerStatus); + am1.waitForState(RMAppAttemptState.FAILED); + rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED); + + // wait for the next AM attempt to reach the SCHEDULED state + RMAppAttempt attempt = MockRM.waitForAttemptScheduled(app1, rm1); + System.out.println("Launch AM " + attempt.getAppAttemptId()); + + + + currentNode.nodeHeartbeat(true); + dispatcher.await(); + Assert.assertEquals( + "AppAttemptState should still be SCHEDULED if currentNode is " + + "blacklisted correctly", + RMAppAttemptState.SCHEDULED, + attempt.getAppAttemptState()); + + otherNode.nodeHeartbeat(true); + dispatcher.await(); + + MockAM am2 = rm1.sendAMLaunched(attempt.getAppAttemptId()); + rm1.waitForState(attempt.getAppAttemptId(), RMAppAttemptState.LAUNCHED); + + amContainer = + ContainerId.newContainerId(am2.getApplicationAttemptId(), 1); + rmContainer = scheduler.getRMContainer(amContainer); + nodeWhereAMRan = rmContainer.getAllocatedNode(); + Assert.assertEquals( + "After blacklisting AM should have run on the other node", + otherNode.getNodeId(), nodeWhereAMRan); + + am2.registerAppAttempt(); + rm1.waitForState(app1.getApplicationId(), RMAppState.RUNNING); + + List allocatedContainers = + allocateContainers(currentNode, 
am2, 1); + Assert.assertEquals( + "Even though AM is blacklisted from the node, application can still " + + "allocate containers there", + currentNode.getNodeId(), allocatedContainers.get(0).getNodeId()); + } + // AM container preempted, nm disk failure // should not be counted towards AM max retry count. @Test(timeout = 100000) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/TestBlacklistManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/TestBlacklistManager.java new file mode 100644 index 00000000000..96b373f98c9 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/blacklist/TestBlacklistManager.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.resourcemanager.blacklist; + + +import org.junit.Assert; +import org.junit.Test; + +import java.util.Collections; +import java.util.List; + +public class TestBlacklistManager { + + @Test + public void testSimpleBlacklistBelowFailureThreshold() { + final int numberOfNodeManagerHosts = 3; + final double blacklistDisableFailureThreshold = 0.8; + BlacklistManager manager = new SimpleBlacklistManager( + numberOfNodeManagerHosts, blacklistDisableFailureThreshold); + String anyNode = "foo"; + String anyNode2 = "bar"; + manager.addNode(anyNode); + manager.addNode(anyNode2); + BlacklistUpdates blacklist = manager + .getBlacklistUpdates(); + + List blacklistAdditions = blacklist.getAdditions(); + Collections.sort(blacklistAdditions); + List blacklistRemovals = blacklist.getRemovals(); + String[] expectedBlacklistAdditions = new String[]{anyNode2, anyNode}; + Assert.assertArrayEquals( + "Blacklist additions was not as expected", + expectedBlacklistAdditions, + blacklistAdditions.toArray()); + Assert.assertTrue( + "Blacklist removals should be empty but was " + + blacklistRemovals, + blacklistRemovals.isEmpty()); + } + + @Test + public void testSimpleBlacklistAboveFailureThreshold() { + // Create a threshold of 0.5 * 3 i.e at 1.5 node failures. 
+ BlacklistManager manager = new SimpleBlacklistManager(3, 0.5); + String anyNode = "foo"; + String anyNode2 = "bar"; + manager.addNode(anyNode); + BlacklistUpdates blacklist = manager + .getBlacklistUpdates(); + + List blacklistAdditions = blacklist.getAdditions(); + Collections.sort(blacklistAdditions); + List blacklistRemovals = blacklist.getRemovals(); + String[] expectedBlacklistAdditions = new String[]{anyNode}; + Assert.assertArrayEquals( + "Blacklist additions was not as expected", + expectedBlacklistAdditions, + blacklistAdditions.toArray()); + Assert.assertTrue( + "Blacklist removals should be empty but was " + + blacklistRemovals, + blacklistRemovals.isEmpty()); + + manager.addNode(anyNode2); + + blacklist = manager + .getBlacklistUpdates(); + blacklistAdditions = blacklist.getAdditions(); + Collections.sort(blacklistAdditions); + blacklistRemovals = blacklist.getRemovals(); + Collections.sort(blacklistRemovals); + String[] expectedBlacklistRemovals = new String[] {anyNode2, anyNode}; + Assert.assertTrue( + "Blacklist additions should be empty but was " + + blacklistAdditions, + blacklistAdditions.isEmpty()); + Assert.assertArrayEquals( + "Blacklist removals was not as expected", + expectedBlacklistRemovals, + blacklistRemovals.toArray()); + } + + @Test + public void testDisabledBlacklist() { + BlacklistManager disabled = new DisabledBlacklistManager(); + String anyNode = "foo"; + disabled.addNode(anyNode); + BlacklistUpdates blacklist = disabled + .getBlacklistUpdates(); + + List blacklistAdditions = blacklist.getAdditions(); + List blacklistRemovals = blacklist.getRemovals(); + Assert.assertTrue( + "Blacklist additions should be empty but was " + + blacklistAdditions, + blacklistAdditions.isEmpty()); + Assert.assertTrue( + "Blacklist removals should be empty but was " + + blacklistRemovals, + blacklistRemovals.isEmpty()); + } +} diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java index fccfa19e19f..484a1b627f3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/logaggregationstatus/TestRMAppLogAggregationStatus.java @@ -489,7 +489,7 @@ public class TestRMAppLogAggregationStatus { 2, Resource.newInstance(10, 2), "test"); return new RMAppImpl(this.appId, this.rmContext, conf, "test", "test", "default", submissionContext, - this.rmContext.getScheduler(), + scheduler, this.rmContext.getApplicationMasterService(), System.currentTimeMillis(), "test", null, null); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java index 2e64d61b08a..a5e3308f565 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java @@ -970,7 
+970,7 @@ public class TestRMAppTransitions { appState.getApplicationSubmissionContext().getApplicationId(), rmContext, conf, submissionContext.getApplicationName(), null, - submissionContext.getQueue(), submissionContext, null, null, + submissionContext.getQueue(), submissionContext, scheduler, null, appState.getSubmitTime(), submissionContext.getApplicationType(), submissionContext.getApplicationTags(), BuilderUtils.newResourceRequest( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java index 44773be8ce2..76a1351e56d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java @@ -56,6 +56,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationAccessType; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationResourceUsageReport; +import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerState; @@ -655,6 +656,11 @@ public class TestCapacityScheduler { RMAppImpl app = mock(RMAppImpl.class); when(app.getApplicationId()).thenReturn(appId); RMAppAttemptImpl attempt = mock(RMAppAttemptImpl.class); + 
Container container = mock(Container.class); + when(attempt.getMasterContainer()).thenReturn(container); + ApplicationSubmissionContext submissionContext = mock( + ApplicationSubmissionContext.class); + when(attempt.getSubmissionContext()).thenReturn(submissionContext); when(attempt.getAppAttemptId()).thenReturn(appAttemptId); when(attempt.getRMAppAttemptMetrics()).thenReturn(attemptMetric); when(app.getCurrentAppAttempt()).thenReturn(attempt); @@ -715,6 +721,11 @@ public class TestCapacityScheduler { RMAppImpl app1 = mock(RMAppImpl.class); when(app1.getApplicationId()).thenReturn(appId1); RMAppAttemptImpl attempt1 = mock(RMAppAttemptImpl.class); + Container container = mock(Container.class); + when(attempt1.getMasterContainer()).thenReturn(container); + ApplicationSubmissionContext submissionContext = mock( + ApplicationSubmissionContext.class); + when(attempt1.getSubmissionContext()).thenReturn(submissionContext); when(attempt1.getAppAttemptId()).thenReturn(appAttemptId1); when(attempt1.getRMAppAttemptMetrics()).thenReturn(attemptMetric1); when(app1.getCurrentAppAttempt()).thenReturn(attempt1); @@ -739,6 +750,8 @@ public class TestCapacityScheduler { RMAppImpl app2 = mock(RMAppImpl.class); when(app2.getApplicationId()).thenReturn(appId2); RMAppAttemptImpl attempt2 = mock(RMAppAttemptImpl.class); + when(attempt2.getMasterContainer()).thenReturn(container); + when(attempt2.getSubmissionContext()).thenReturn(submissionContext); when(attempt2.getAppAttemptId()).thenReturn(appAttemptId2); when(attempt2.getRMAppAttemptMetrics()).thenReturn(attemptMetric2); when(app2.getCurrentAppAttempt()).thenReturn(attempt2); @@ -2876,6 +2889,11 @@ public class TestCapacityScheduler { RMAppImpl app = mock(RMAppImpl.class); when(app.getApplicationId()).thenReturn(appId); RMAppAttemptImpl attempt = mock(RMAppAttemptImpl.class); + Container container = mock(Container.class); + when(attempt.getMasterContainer()).thenReturn(container); + ApplicationSubmissionContext submissionContext = 
mock( + ApplicationSubmissionContext.class); + when(attempt.getSubmissionContext()).thenReturn(submissionContext); when(attempt.getAppAttemptId()).thenReturn(appAttemptId); when(attempt.getRMAppAttemptMetrics()).thenReturn(attemptMetric); when(app.getCurrentAppAttempt()).thenReturn(attempt); @@ -2953,6 +2971,11 @@ public class TestCapacityScheduler { RMAppImpl app = mock(RMAppImpl.class); when(app.getApplicationId()).thenReturn(appId); RMAppAttemptImpl attempt = mock(RMAppAttemptImpl.class); + Container container = mock(Container.class); + when(attempt.getMasterContainer()).thenReturn(container); + ApplicationSubmissionContext submissionContext = mock( + ApplicationSubmissionContext.class); + when(attempt.getSubmissionContext()).thenReturn(submissionContext); when(attempt.getAppAttemptId()).thenReturn(appAttemptId); when(attempt.getRMAppAttemptMetrics()).thenReturn(attemptMetric); when(app.getCurrentAppAttempt()).thenReturn(attempt); @@ -2976,6 +2999,8 @@ public class TestCapacityScheduler { RMAppImpl app2 = mock(RMAppImpl.class); when(app2.getApplicationId()).thenReturn(appId2); RMAppAttemptImpl attempt2 = mock(RMAppAttemptImpl.class); + when(attempt2.getMasterContainer()).thenReturn(container); + when(attempt2.getSubmissionContext()).thenReturn(submissionContext); when(attempt2.getAppAttemptId()).thenReturn(appAttemptId2); when(attempt2.getRMAppAttemptMetrics()).thenReturn(attemptMetric2); when(app2.getCurrentAppAttempt()).thenReturn(attempt2); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java index 403c8ea313b..1c9801d7631 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerTestBase.java @@ -220,7 +220,7 @@ public class FairSchedulerTestBase { ApplicationId appId = attId.getApplicationId(); RMApp rmApp = new RMAppImpl(appId, rmContext, conf, null, user, null, ApplicationSubmissionContext.newInstance(appId, null, - queue, null, null, false, false, 0, amResource, null), null, null, + queue, null, null, false, false, 0, amResource, null), scheduler, null, 0, null, null, null); rmContext.getRMApps().put(appId, rmApp); RMAppEvent event = new RMAppEvent(appId, RMAppEventType.START);