From 1970ca7cbcdb7efa160d0cedc2e3e22c1401fad6 Mon Sep 17 00:00:00 2001 From: Jian He Date: Thu, 4 Jun 2015 11:14:09 -0700 Subject: [PATCH] YARN-2392. Add more diags about app retry limits on AM failures. Contributed by Steve Loughran --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../server/resourcemanager/rmapp/RMAppImpl.java | 16 +++++++++++++--- .../rmapp/attempt/RMAppAttemptImpl.java | 4 ++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index a3954e06a96..5a9d8ca8905 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -289,6 +289,9 @@ Release 2.8.0 - UNRELEASED YARN-3467. Expose allocatedMB, allocatedVCores, and runningContainers metrics on running Applications in RM Web UI. (Anubhav Dhoot via kasha) + YARN-2392. Add more diags about app retry limits on AM failures. (Steve + Loughran via jianhe) + OPTIMIZATIONS YARN-3339. TestDockerContainerExecutor should pull a single image and not diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java index f3dacd6a49b..90e63c1d6d3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java @@ -1014,9 +1014,19 @@ public class RMAppImpl implements RMApp, Recoverable { + " failed due to " + failedEvent.getDiagnostics() + ". Failing the application."; } else if (this.isNumAttemptsBeyondThreshold) { - msg = "Application " + this.getApplicationId() + " failed " - + this.maxAppAttempts + " times due to " - + failedEvent.getDiagnostics() + ". Failing the application."; + int globalLimit = conf.getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, + YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS); + msg = String.format( + "Application %s failed %d times%s%s due to %s. Failing the application.", + getApplicationId(), + maxAppAttempts, + (attemptFailuresValidityInterval <= 0 ? "" + : (" in previous " + attemptFailuresValidityInterval + + " milliseconds")), + (globalLimit == maxAppAttempts) ? "" + : (" (global limit =" + globalLimit + + "; local limit is =" + maxAppAttempts + ")"), + failedEvent.getDiagnostics()); } return msg; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java index 684dde84856..5171bba3dca 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java @@ -1459,9 +1459,9 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable { .append(status.getDiagnostics()); if (this.getTrackingUrl() != null) { diagnosticsBuilder.append("For more detailed output,").append( - " check application tracking page: ").append( + " check the application tracking page: ").append( this.getTrackingUrl()).append( - " Then, click on links to logs of each attempt.\n"); + " Then click on links to logs of each attempt.\n"); } return diagnosticsBuilder.toString(); }