From 964c1902c8054dfe13c787222a12fb0daf1aaab9 Mon Sep 17 00:00:00 2001 From: Ashutosh Gupta Date: Fri, 21 Apr 2023 13:03:22 +0100 Subject: [PATCH] YARN-11463. Node Labels root directory creation doesn't have a retry logic (#5562) Co-authored-by: Ashutosh Gupta --- .../hadoop/yarn/conf/YarnConfiguration.java | 10 +++++++ .../nodelabels/store/AbstractFSNodeStore.java | 28 +++++++++++++++++-- .../src/main/resources/yarn-default.xml | 16 +++++++++++ .../TestFileSystemNodeLabelsStore.java | 3 -- 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 6d77eb492dc..a3faec7171b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -217,6 +217,16 @@ public class YarnConfiguration extends Configuration { public static final int DEFAULT_RM_APPLICATION_MAX_TAG_LENGTH = 100; + public static final String NODE_STORE_ROOT_DIR_NUM_RETRIES = + RM_PREFIX + "nodestore-rootdir.num-retries"; + + public static final int NODE_STORE_ROOT_DIR_NUM_DEFAULT_RETRIES = 1000; + + public static final String NODE_STORE_ROOT_DIR_RETRY_INTERVAL = + RM_PREFIX + "nodestore-rootdir.retry-interval-ms"; + + public static final int NODE_STORE_ROOT_DIR_RETRY_DEFAULT_INTERVAL = 1000; + public static final String RM_APPLICATION_MASTER_SERVICE_PROCESSORS = RM_PREFIX + "application-master-service.processors"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/nodelabels/store/AbstractFSNodeStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/nodelabels/store/AbstractFSNodeStore.java index 81514942af0..a697be19512 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/nodelabels/store/AbstractFSNodeStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/nodelabels/store/AbstractFSNodeStore.java @@ -65,8 +65,32 @@ public abstract class AbstractFSNodeStore { this.fsWorkingPath = fsStorePath; this.manager = mgr; initFileSystem(conf); - // mkdir of root dir path - fs.mkdirs(fsWorkingPath); + // mkdir of root dir path with retry logic + int maxRetries = conf.getInt(YarnConfiguration.NODE_STORE_ROOT_DIR_NUM_RETRIES, + YarnConfiguration.NODE_STORE_ROOT_DIR_NUM_DEFAULT_RETRIES); + int retryCount = 0; + boolean success = fs.mkdirs(fsWorkingPath); + + while (!success && retryCount < maxRetries) { + try { + if (!fs.exists(fsWorkingPath)) { + success = fs.mkdirs(fsWorkingPath); + } else { + success = true; + } + } catch (IOException e) { + retryCount++; + if (retryCount >= maxRetries) { + throw e; + } + try { + Thread.sleep(conf.getInt(YarnConfiguration.NODE_STORE_ROOT_DIR_RETRY_INTERVAL, + YarnConfiguration.NODE_STORE_ROOT_DIR_RETRY_DEFAULT_INTERVAL)); + } catch (InterruptedException ie) { + throw new RuntimeException(ie); + } + } + } this.replication = conf.getInt(YarnConfiguration.FS_STORE_FILE_REPLICATION, YarnConfiguration.DEFAULT_FS_STORE_FILE_REPLICATION); LOG.info("Created store directory :" + fsWorkingPath); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index b9385d1c276..4fc414f0e01 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -5177,4 +5177,20 @@ 1 + + + Number of Retries while trying to make root directory for node store. + + yarn.resourcemanager.nodestore-rootdir.num-retries + 1000 + + + + + Interval in ms between retries while trying to make root directory for node store. + + yarn.resourcemanager.nodestore-rootdir.retry-interval-ms + 1000 + + diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/nodelabels/TestFileSystemNodeLabelsStore.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/nodelabels/TestFileSystemNodeLabelsStore.java index 099684318f4..a861b0654ea 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/nodelabels/TestFileSystemNodeLabelsStore.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/nodelabels/TestFileSystemNodeLabelsStore.java @@ -359,9 +359,6 @@ public class TestFileSystemNodeLabelsStore extends NodeLabelTestBase { mockStore.setFs(mockFs); verifyMkdirsCount(mockStore, true, 1); - verifyMkdirsCount(mockStore, false, 2); - verifyMkdirsCount(mockStore, true, 3); - verifyMkdirsCount(mockStore, false, 4); } private void verifyMkdirsCount(FileSystemNodeLabelsStore store,