From 0af6a99a3fcfa4b47d3bcba5e5cc5fe7b312a152 Mon Sep 17 00:00:00 2001 From: Jian He Date: Mon, 9 Feb 2015 13:47:08 -0800 Subject: [PATCH] YARN-3094. Reset timer for liveness monitors after RM recovery. Contributed by Jun Gong --- hadoop-yarn-project/CHANGES.txt | 3 + .../yarn/util/AbstractLivelinessMonitor.java | 8 ++ .../resourcemanager/ResourceManager.java | 2 + .../rmapp/attempt/AMLivelinessMonitor.java | 6 ++ .../attempt/TestAMLivelinessMonitor.java | 81 +++++++++++++++++++ 5 files changed, 100 insertions(+) create mode 100644 hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestAMLivelinessMonitor.java diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index cc2f34b041b..578a8cc2886 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -516,6 +516,9 @@ Release 2.7.0 - UNRELEASED YARN-3143. RM Apps REST API can return NPE or entries missing id and other fields (jlowe) + YARN-3094. Reset timer for liveness monitors after RM recovery. (Jun Gong + via jianhe) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java index c1825319a73..4f587b348cf 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/util/AbstractLivelinessMonitor.java @@ -59,6 +59,7 @@ public abstract class AbstractLivelinessMonitor extends AbstractService { @Override protected void serviceStart() throws Exception { assert !stopped : "starting when already stopped"; + resetTimer(); checkerThread = new Thread(new PingChecker()); checkerThread.setName("Ping Checker"); checkerThread.start(); @@ -99,6 +100,13 @@ public abstract class AbstractLivelinessMonitor extends AbstractService { running.remove(ob); } + public synchronized void resetTimer() { + long time = clock.getTime(); + for (O ob : running.keySet()) { + running.put(ob, time); + } + } + private class PingChecker implements Runnable { @Override diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java index 4f242e93ae4..a93372a72fe 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java @@ -564,12 +564,14 @@ public class ResourceManager extends CompositeService implements Recoverable { if(recoveryEnabled) { try { + LOG.info("Recovery started"); rmStore.checkVersion(); if (rmContext.isWorkPreservingRecoveryEnabled()) { rmContext.setEpoch(rmStore.getAndIncrementEpoch()); } RMState state = rmStore.loadState(); recover(state); + LOG.info("Recovery ended"); } catch (Exception e) { // the Exception from loadState() needs to be handled for // HA and we need to give up master status if we got fenced diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java index 2c1f7f1f03e..76331bf7fec 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/AMLivelinessMonitor.java @@ -24,6 +24,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.util.AbstractLivelinessMonitor; +import org.apache.hadoop.yarn.util.Clock; import org.apache.hadoop.yarn.util.SystemClock; public class AMLivelinessMonitor extends AbstractLivelinessMonitor { @@ -35,6 +36,11 @@ public class AMLivelinessMonitor extends AbstractLivelinessMonitor