YARN-11463. Node Labels root directory creation doesn't have a retry logic (#5562)

Co-authored-by: Ashutosh Gupta <ashugpt@amazon.com>
This commit is contained in:
Ashutosh Gupta 2023-04-21 13:03:22 +01:00 committed by GitHub
parent 9e24ed2196
commit 964c1902c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 52 additions and 5 deletions

View File

@ -217,6 +217,16 @@ public class YarnConfiguration extends Configuration {
public static final int DEFAULT_RM_APPLICATION_MAX_TAG_LENGTH = 100; public static final int DEFAULT_RM_APPLICATION_MAX_TAG_LENGTH = 100;
/**
 * Configuration key for the maximum number of retries when creating the
 * node store root directory.
 */
public static final String NODE_STORE_ROOT_DIR_NUM_RETRIES =
RM_PREFIX + "nodestore-rootdir.num-retries";
/** Default value for {@link #NODE_STORE_ROOT_DIR_NUM_RETRIES}. */
public static final int NODE_STORE_ROOT_DIR_NUM_DEFAULT_RETRIES = 1000;
/**
 * Configuration key for the interval, in milliseconds, to wait between
 * retries when creating the node store root directory.
 */
public static final String NODE_STORE_ROOT_DIR_RETRY_INTERVAL =
RM_PREFIX + "nodestore-rootdir.retry-interval-ms";
/** Default value, in milliseconds, for {@link #NODE_STORE_ROOT_DIR_RETRY_INTERVAL}. */
public static final int NODE_STORE_ROOT_DIR_RETRY_DEFAULT_INTERVAL = 1000;
public static final String RM_APPLICATION_MASTER_SERVICE_PROCESSORS = public static final String RM_APPLICATION_MASTER_SERVICE_PROCESSORS =
RM_PREFIX + "application-master-service.processors"; RM_PREFIX + "application-master-service.processors";

View File

@ -65,8 +65,32 @@ public abstract class AbstractFSNodeStore<M> {
this.fsWorkingPath = fsStorePath; this.fsWorkingPath = fsStorePath;
this.manager = mgr; this.manager = mgr;
initFileSystem(conf); initFileSystem(conf);
// mkdir of root dir path // mkdir of root dir path with retry logic
fs.mkdirs(fsWorkingPath); int maxRetries = conf.getInt(YarnConfiguration.NODE_STORE_ROOT_DIR_NUM_RETRIES,
YarnConfiguration.NODE_STORE_ROOT_DIR_NUM_DEFAULT_RETRIES);
int retryCount = 0;
boolean success = fs.mkdirs(fsWorkingPath);
while (!success && retryCount < maxRetries) {
try {
if (!fs.exists(fsWorkingPath)) {
success = fs.mkdirs(fsWorkingPath);
} else {
success = true;
}
} catch (IOException e) {
retryCount++;
if (retryCount >= maxRetries) {
throw e;
}
try {
Thread.sleep(conf.getInt(YarnConfiguration.NODE_STORE_ROOT_DIR_RETRY_INTERVAL,
YarnConfiguration.NODE_STORE_ROOT_DIR_RETRY_DEFAULT_INTERVAL));
} catch (InterruptedException ie) {
throw new RuntimeException(ie);
}
}
}
this.replication = conf.getInt(YarnConfiguration.FS_STORE_FILE_REPLICATION, this.replication = conf.getInt(YarnConfiguration.FS_STORE_FILE_REPLICATION,
YarnConfiguration.DEFAULT_FS_STORE_FILE_REPLICATION); YarnConfiguration.DEFAULT_FS_STORE_FILE_REPLICATION);
LOG.info("Created store directory :" + fsWorkingPath); LOG.info("Created store directory :" + fsWorkingPath);

View File

@ -5177,4 +5177,20 @@
<value>1</value> <value>1</value>
</property> </property>
<property>
<description>
Number of retries when creating the root directory for the node store.
</description>
<name>yarn.resourcemanager.nodestore-rootdir.num-retries</name>
<value>1000</value>
</property>
<property>
<description>
Interval in milliseconds between retries when creating the root directory for the node store.
</description>
<name>yarn.resourcemanager.nodestore-rootdir.retry-interval-ms</name>
<value>1000</value>
</property>
</configuration> </configuration>

View File

@ -359,9 +359,6 @@ public class TestFileSystemNodeLabelsStore extends NodeLabelTestBase {
mockStore.setFs(mockFs); mockStore.setFs(mockFs);
verifyMkdirsCount(mockStore, true, 1); verifyMkdirsCount(mockStore, true, 1);
verifyMkdirsCount(mockStore, false, 2);
verifyMkdirsCount(mockStore, true, 3);
verifyMkdirsCount(mockStore, false, 4);
} }
private void verifyMkdirsCount(FileSystemNodeLabelsStore store, private void verifyMkdirsCount(FileSystemNodeLabelsStore store,