From 7a8b6265c626c45d3af099723f0b1b9b76dd5cb4 Mon Sep 17 00:00:00 2001 From: Szilard Nemeth Date: Thu, 29 Jul 2021 19:22:57 +0200 Subject: [PATCH] YARN-10789. RM HA startup can fail due to race conditions in ZKConfigurationStore. Contributed by Tarun Parimi --- .../capacity/conf/ZKConfigurationStore.java | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java index 3742c363fcb..6f3612c28da 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.conf; import com.google.common.annotations.VisibleForTesting; +import org.apache.zookeeper.KeeperException.NodeExistsException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -62,7 +63,8 @@ public class ZKConfigurationStore extends YarnConfigurationStore { private static final String CONF_STORE_PATH = "CONF_STORE"; private static final String FENCING_PATH = "FENCING"; private static final String CONF_VERSION_PATH = "CONF_VERSION"; - + private static final String NODEEXISTS_MSG = "Encountered NodeExists error." + + " Skipping znode creation since another RM has already created it"; private String zkVersionPath; private String logsPath; private String confStorePath; @@ -93,7 +95,11 @@ public class ZKConfigurationStore extends YarnConfigurationStore { this.fencingNodePath = getNodePath(znodeParentPath, FENCING_PATH); this.confVersionPath = getNodePath(znodeParentPath, CONF_VERSION_PATH); - zkManager.createRootDirRecursively(znodeParentPath, zkAcl); + try { + zkManager.createRootDirRecursively(znodeParentPath, zkAcl); + } catch(NodeExistsException e) { + LOG.warn(NODEEXISTS_MSG, e); + } zkManager.delete(fencingNodePath); if (createNewZkPath(logsPath)) { @@ -247,7 +253,12 @@ public class ZKConfigurationStore extends YarnConfigurationStore { */ private boolean createNewZkPath(String path) throws Exception { if (!zkManager.exists(path)) { - zkManager.create(path); + try { + zkManager.create(path); + } catch(NodeExistsException e) { + LOG.warn(NODEEXISTS_MSG, e); + return false; + } return true; } else { return false; @@ -282,8 +293,12 @@ public class ZKConfigurationStore extends YarnConfigurationStore { @VisibleForTesting protected void safeCreateZkData(String path, byte[] data) throws Exception { - zkManager.safeCreate(path, data, zkAcl, CreateMode.PERSISTENT, - zkAcl, fencingNodePath); + try { + zkManager.safeCreate(path, data, zkAcl, CreateMode.PERSISTENT, + zkAcl, fencingNodePath); + } catch(NodeExistsException e) { + LOG.warn(NODEEXISTS_MSG, e); + } } private static String getNodePath(String root, String nodeName) {