diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index a8f8633d721..54e74a95944 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -603,6 +603,8 @@ Release 0.23.3 - UNRELEASED MAPREDUCE-4320. gridmix mainClass wrong in pom.xml (tgraves) + MAPREDUCE-4295. RM crashes due to DNS issue (tgraves) + Release 0.23.2 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java index 75d52490d67..aed6c902fb1 100644 --- a/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java +++ b/hadoop-mapreduce-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java @@ -1180,9 +1180,16 @@ public class LeafQueue implements CSQueue { if (UserGroupInformation.isSecurityEnabled()) { ContainerTokenIdentifier tokenIdentifier = new ContainerTokenIdentifier( containerId, nodeId.toString(), capability); - containerToken = BuilderUtils.newContainerToken(nodeId, ByteBuffer - .wrap(containerTokenSecretManager - .createPassword(tokenIdentifier)), tokenIdentifier); + try { + containerToken = BuilderUtils.newContainerToken(nodeId, ByteBuffer + .wrap(containerTokenSecretManager + .createPassword(tokenIdentifier)), tokenIdentifier); + } catch (IllegalArgumentException e) { + // this could be because DNS is down - in which case we just want + // to retry and not bring RM down + LOG.error("Error trying to create new container", e); + return null; + } } // Create the container @@ -1211,6 +1218,11 @@ public class LeafQueue implements CSQueue { // Create the container if necessary Container container = getContainer(rmContainer, application, node, capability, priority); + + // something went wrong getting/creating the container + if (container == null) { + return Resources.none(); + } // Can we allocate a container on this node? int availableContainers =