From ca0349b87ab1b2d0d2b9dc93de7806d26713165c Mon Sep 17 00:00:00 2001 From: Jian He Date: Fri, 26 Dec 2014 17:59:58 -0800 Subject: [PATCH] YARN-2992. ZKRMStateStore crashes due to session expiry. Contributed by Karthik Kambatla (cherry picked from commit 1454efe5d4fe4214ec5ef9142d55dbeca7dab953) --- hadoop-yarn-project/CHANGES.txt | 3 +++ .../yarn/server/resourcemanager/recovery/ZKRMStateStore.java | 3 +++ 2 files changed, 6 insertions(+) diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 1098eaca93d..1c8c23718b3 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -257,6 +257,9 @@ Release 2.7.0 - UNRELEASED YARN-2988. Graph#save() may leak file descriptors. (Ted Yu via ozawa) + YARN-2992. ZKRMStateStore crashes due to session expiry. (Karthik Kambatla + via jianhe) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java index e559e4c41e1..285f3d0be54 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/recovery/ZKRMStateStore.java @@ -1056,6 +1056,8 @@ public class ZKRMStateStore extends RMStateStore { switch (code) { case CONNECTIONLOSS: case OPERATIONTIMEOUT: + case SESSIONEXPIRED: + case SESSIONMOVED: return true; default: break; @@ -1084,6 +1086,7 @@ public class ZKRMStateStore extends RMStateStore { if (shouldRetry(ke.code()) && ++retry < numRetries) { LOG.info("Retrying operation on ZK. Retry no. " + retry); Thread.sleep(zkRetryInterval); + createConnection(); continue; } LOG.info("Maxed out ZK retries. Giving up!");