Have a dedicated join timeout that is higher than ping.timeout for node join
Using ping.timeout, which defaults to 3s, as the timeout for the join request a node makes to the master once it is discovered can be too small, specifically when there is a large cluster state involved (and by definition, all the buffers and such on the nio layer will be "cold"). Introduce a dedicated join.timeout setting that by default is 10x the ping.timeout (so 30s by default). Closes #6342
This commit is contained in:
parent
0e2d33b4a4
commit
9c98bb3554
|
@ -72,7 +72,9 @@ As part of the initial ping process a master of the cluster is either
|
|||
elected or joined to. This is done automatically. The
|
||||
`discovery.zen.ping_timeout` (which defaults to `3s`) allows you to
|
||||
configure the election to handle cases of slow or congested networks
|
||||
(higher values assure less chance of failure).
|
||||
(higher values assure less chance of failure). Once a node joins, it
|
||||
will send a join request to the master (`discovery.zen.join_timeout`)
|
||||
with a timeout that defaults to 10 times the ping timeout.
|
||||
|
||||
Nodes can be excluded from becoming a master by setting `node.master` to
|
||||
`false`. Note, once a node is a client node (`node.client` set to
|
||||
|
|
|
@ -93,6 +93,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
|
||||
|
||||
private final TimeValue pingTimeout;
|
||||
private final TimeValue joinTimeout;
|
||||
|
||||
// a flag that should be used only for testing
|
||||
private final boolean sendLeaveRequest;
|
||||
|
@ -134,12 +135,13 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
|
||||
// also support direct discovery.zen settings, for cases when it gets extended
|
||||
this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
|
||||
this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 10));
|
||||
this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
|
||||
|
||||
this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
|
||||
this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
|
||||
|
||||
logger.debug("using ping.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
|
||||
logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
|
||||
|
||||
this.electMaster = new ElectMasterService(settings);
|
||||
nodeSettingsService.addListener(new ApplySettings());
|
||||
|
@ -343,7 +345,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
}
|
||||
// send join request
|
||||
try {
|
||||
membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout);
|
||||
membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
|
||||
} catch (Exception e) {
|
||||
if (e instanceof ElasticsearchException) {
|
||||
logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
|
||||
|
@ -713,7 +715,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
|
|||
|
||||
// validate the join request, will throw a failure if it fails, which will get back to the
|
||||
// node calling the join request
|
||||
membership.sendValidateJoinRequestBlocking(node, state, pingTimeout);
|
||||
membership.sendValidateJoinRequestBlocking(node, state, joinTimeout);
|
||||
|
||||
clusterService.submitStateUpdateTask("zen-disco-receive(join from node[" + node + "])", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue