From 617c325c70bf33e4e9e7ae8016c8f2b777869526 Mon Sep 17 00:00:00 2001 From: Atul Mohan Date: Tue, 21 Mar 2023 02:15:28 -0700 Subject: [PATCH] Make zk connection retries configurable (#13913) * This makes the ZooKeeper connection retry count configurable. This is presently hardcoded to 29 tries, which ends up taking a long time for the Druid node to shut down in case of ZK connectivity loss. Having a shorter retry count helps k8s deployments to fail fast. In situations where the underlying k8s node loses network connectivity or is no longer able to talk to ZooKeeper, failing fast can trigger pod restarts which can then reassign the pod to a healthy k8s node. Existing behavior is preserved, but users can override this property if needed. --- .../java/org/apache/druid/curator/CuratorConfig.java | 10 ++++++++++ .../java/org/apache/druid/curator/CuratorModule.java | 3 +-- .../org/apache/druid/curator/CuratorConfigTest.java | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/apache/druid/curator/CuratorConfig.java b/server/src/main/java/org/apache/druid/curator/CuratorConfig.java index 68ce0812deb..7a53ee941d7 100644 --- a/server/src/main/java/org/apache/druid/curator/CuratorConfig.java +++ b/server/src/main/java/org/apache/druid/curator/CuratorConfig.java @@ -58,6 +58,11 @@ public class CuratorConfig @JsonProperty("authScheme") private String authScheme = "digest"; + // Configures the maximum number of retries for attempting connection to Zookeeper. + // Smaller retry counts helps nodes to fail fast in case of ZK connection loss. 
+ @JsonProperty("maxZkRetries") + private int maxZkRetries = 29; + public static CuratorConfig create(String hosts) { CuratorConfig config = new CuratorConfig(); @@ -131,4 +136,9 @@ public class CuratorConfig { return authScheme; } + + public int getMaxZkRetries() + { + return maxZkRetries; + } } diff --git a/server/src/main/java/org/apache/druid/curator/CuratorModule.java b/server/src/main/java/org/apache/druid/curator/CuratorModule.java index 07c0ad84780..201c96bfdff 100644 --- a/server/src/main/java/org/apache/druid/curator/CuratorModule.java +++ b/server/src/main/java/org/apache/druid/curator/CuratorModule.java @@ -49,7 +49,6 @@ public class CuratorModule implements Module static final int BASE_SLEEP_TIME_MS = 1000; static final int MAX_SLEEP_TIME_MS = 45000; - private static final int MAX_RETRIES = 29; private final boolean haltOnFailedStart; @@ -89,7 +88,7 @@ public class CuratorModule implements Module ); } - RetryPolicy retryPolicy = new BoundedExponentialBackoffRetry(BASE_SLEEP_TIME_MS, MAX_SLEEP_TIME_MS, MAX_RETRIES); + RetryPolicy retryPolicy = new BoundedExponentialBackoffRetry(BASE_SLEEP_TIME_MS, MAX_SLEEP_TIME_MS, config.getMaxZkRetries()); return builder .ensembleProvider(new FixedEnsembleProvider(config.getZkHosts())) diff --git a/server/src/test/java/org/apache/druid/curator/CuratorConfigTest.java b/server/src/test/java/org/apache/druid/curator/CuratorConfigTest.java index b3719d85d1f..ef4d4fc3540 100644 --- a/server/src/test/java/org/apache/druid/curator/CuratorConfigTest.java +++ b/server/src/test/java/org/apache/druid/curator/CuratorConfigTest.java @@ -33,6 +33,7 @@ public class CuratorConfigTest extends JsonConfigTesterBase propertyValues.put(getPropertyKey("user"), "test-zk-user"); propertyValues.put(getPropertyKey("pwd"), "test-zk-pwd"); propertyValues.put(getPropertyKey("authScheme"), "auth"); + propertyValues.put(getPropertyKey("maxZkRetries"), "20"); testProperties.putAll(propertyValues); configProvider.inject(testProperties, 
configurator); CuratorConfig config = configProvider.get().get(); @@ -41,6 +42,7 @@ public class CuratorConfigTest extends JsonConfigTesterBase Assert.assertEquals("test-zk-user", config.getZkUser()); Assert.assertEquals("test-zk-pwd", config.getZkPwd()); Assert.assertEquals("auth", config.getAuthScheme()); + Assert.assertEquals(20, config.getMaxZkRetries()); } @Test @@ -51,5 +53,6 @@ public class CuratorConfigTest extends JsonConfigTesterBase Assert.assertEquals(false, config.getEnableAcl()); Assert.assertNull(config.getZkUser()); Assert.assertEquals("digest", config.getAuthScheme()); + Assert.assertEquals(29, config.getMaxZkRetries()); } }