Make zk connection retries configurable (#13913)

* This makes the ZooKeeper connection retry count configurable. It is presently hardcoded to 29 tries, which ends up taking a long time for the Druid node to shut down in case of ZK connectivity loss.
Having a shorter retry count helps k8s deployments to fail fast. In situations where the underlying k8s node loses network connectivity or is no longer able to talk to zookeeper, failing fast can trigger pod restarts which can then reassign the pod to a healthy k8s node.
Existing behavior is preserved, but users can override this property if needed.
This commit is contained in:
Atul Mohan 2023-03-21 02:15:28 -07:00 committed by GitHub
parent 143fdcfacf
commit 617c325c70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 2 deletions

View File

@ -58,6 +58,11 @@ public class CuratorConfig
@JsonProperty("authScheme")
private String authScheme = "digest";
// Configures the maximum number of retries for attempting connection to Zookeeper.
// Smaller retry counts help nodes fail fast in case of ZK connection loss.
@JsonProperty("maxZkRetries")
private int maxZkRetries = 29;
public static CuratorConfig create(String hosts)
{
CuratorConfig config = new CuratorConfig();
@ -131,4 +136,9 @@ public class CuratorConfig
{
return authScheme;
}
/**
 * Returns the maximum number of retries used when attempting to connect to ZooKeeper.
 *
 * @return the current value of the {@code maxZkRetries} property
 */
public int getMaxZkRetries()
{
  return this.maxZkRetries;
}
}

View File

@ -49,7 +49,6 @@ public class CuratorModule implements Module
static final int BASE_SLEEP_TIME_MS = 1000;
static final int MAX_SLEEP_TIME_MS = 45000;
private static final int MAX_RETRIES = 29;
private final boolean haltOnFailedStart;
@ -89,7 +88,7 @@ public class CuratorModule implements Module
);
}
RetryPolicy retryPolicy = new BoundedExponentialBackoffRetry(BASE_SLEEP_TIME_MS, MAX_SLEEP_TIME_MS, MAX_RETRIES);
RetryPolicy retryPolicy = new BoundedExponentialBackoffRetry(BASE_SLEEP_TIME_MS, MAX_SLEEP_TIME_MS, config.getMaxZkRetries());
return builder
.ensembleProvider(new FixedEnsembleProvider(config.getZkHosts()))

View File

@ -33,6 +33,7 @@ public class CuratorConfigTest extends JsonConfigTesterBase<CuratorConfig>
propertyValues.put(getPropertyKey("user"), "test-zk-user");
propertyValues.put(getPropertyKey("pwd"), "test-zk-pwd");
propertyValues.put(getPropertyKey("authScheme"), "auth");
propertyValues.put(getPropertyKey("maxZkRetries"), "20");
testProperties.putAll(propertyValues);
configProvider.inject(testProperties, configurator);
CuratorConfig config = configProvider.get().get();
@ -41,6 +42,7 @@ public class CuratorConfigTest extends JsonConfigTesterBase<CuratorConfig>
Assert.assertEquals("test-zk-user", config.getZkUser());
Assert.assertEquals("test-zk-pwd", config.getZkPwd());
Assert.assertEquals("auth", config.getAuthScheme());
Assert.assertEquals(20, config.getMaxZkRetries());
}
@Test
@ -51,5 +53,6 @@ public class CuratorConfigTest extends JsonConfigTesterBase<CuratorConfig>
Assert.assertEquals(false, config.getEnableAcl());
Assert.assertNull(config.getZkUser());
Assert.assertEquals("digest", config.getAuthScheme());
Assert.assertEquals(29, config.getMaxZkRetries());
}
}