From 7003997e36014974eee15f2796c5ad7a1436d430 Mon Sep 17 00:00:00 2001 From: Szilard Nemeth Date: Sat, 12 Jun 2021 14:49:52 +0200 Subject: [PATCH] YARN-10789. RM HA startup can fail due to race conditions in ZKConfigurationStore. Contributed by Tarun Parimi --- .../capacity/conf/ZKConfigurationStore.java | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java index 7276d3d0143..703193b1b29 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/conf/ZKConfigurationStore.java @@ -19,6 +19,7 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.conf; import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; +import org.apache.zookeeper.KeeperException.NodeExistsException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -61,7 +62,8 @@ public class ZKConfigurationStore extends YarnConfigurationStore { private static final String CONF_STORE_PATH = "CONF_STORE"; private static final String FENCING_PATH = "FENCING"; private static final String CONF_VERSION_PATH = "CONF_VERSION"; - + private static final String NODEEXISTS_MSG = "Encountered NodeExists error." + + " Skipping znode creation since another RM has already created it"; private String zkVersionPath; private String logsPath; private String confStorePath; @@ -92,7 +94,11 @@ public class ZKConfigurationStore extends YarnConfigurationStore { this.fencingNodePath = getNodePath(znodeParentPath, FENCING_PATH); this.confVersionPath = getNodePath(znodeParentPath, CONF_VERSION_PATH); - zkManager.createRootDirRecursively(znodeParentPath, zkAcl); + try { + zkManager.createRootDirRecursively(znodeParentPath, zkAcl); + } catch(NodeExistsException e) { + LOG.warn(NODEEXISTS_MSG, e); + } zkManager.delete(fencingNodePath); if (createNewZkPath(logsPath)) { @@ -244,7 +250,12 @@ public class ZKConfigurationStore extends YarnConfigurationStore { */ private boolean createNewZkPath(String path) throws Exception { if (!zkManager.exists(path)) { - zkManager.create(path); + try { + zkManager.create(path); + } catch(NodeExistsException e) { + LOG.warn(NODEEXISTS_MSG, e); + return false; + } return true; } else { return false; @@ -279,8 +290,12 @@ public class ZKConfigurationStore extends YarnConfigurationStore { @VisibleForTesting protected void safeCreateZkData(String path, byte[] data) throws Exception { - zkManager.safeCreate(path, data, zkAcl, CreateMode.PERSISTENT, - zkAcl, fencingNodePath); + try { + zkManager.safeCreate(path, data, zkAcl, CreateMode.PERSISTENT, + zkAcl, fencingNodePath); + } catch(NodeExistsException e) { + LOG.warn(NODEEXISTS_MSG, e); + } } private static String getNodePath(String root, String nodeName) {