Have a dedicated join timeout that is higher than ping.timeout for node join

Using ping.timeout, which defaults to 3s, as the timeout for the join request a node makes to the master once it's discovered can be too small, specifically when there is a large cluster state involved (and by definition, all the buffers and such on the nio layer will be "cold"). Introduce a dedicated join.timeout setting that by default is 10x the ping.timeout (so 30s by default).
closes #6342
This commit is contained in:
Shay Banon 2014-05-30 00:49:46 +02:00
parent 0e2d33b4a4
commit 9c98bb3554
2 changed files with 8 additions and 4 deletions

View File

@ -72,7 +72,9 @@ As part of the initial ping process a master of the cluster is either
elected or joined to. This is done automatically. The elected or joined to. This is done automatically. The
`discovery.zen.ping_timeout` (which defaults to `3s`) allows to `discovery.zen.ping_timeout` (which defaults to `3s`) allows to
configure the election to handle cases of slow or congested networks configure the election to handle cases of slow or congested networks
(higher values assure less chance of failure). (higher values assure less chance of failure). Once a node has discovered the master, it
will send a join request to the master. The timeout for this request is set by
`discovery.zen.join_timeout` and defaults to 10 times the ping timeout.
Nodes can be excluded from becoming a master by setting `node.master` to Nodes can be excluded from becoming a master by setting `node.master` to
`false`. Note, once a node is a client node (`node.client` set to `false`. Note, once a node is a client node (`node.client` set to

View File

@ -93,6 +93,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
private final TimeValue pingTimeout; private final TimeValue pingTimeout;
private final TimeValue joinTimeout;
// a flag that should be used only for testing // a flag that should be used only for testing
private final boolean sendLeaveRequest; private final boolean sendLeaveRequest;
@ -134,12 +135,13 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
// also support direct discovery.zen settings, for cases when it gets extended // also support direct discovery.zen settings, for cases when it gets extended
this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3))))); this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 10));
this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true); this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true); this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false); this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
logger.debug("using ping.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes); logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
this.electMaster = new ElectMasterService(settings); this.electMaster = new ElectMasterService(settings);
nodeSettingsService.addListener(new ApplySettings()); nodeSettingsService.addListener(new ApplySettings());
@ -343,7 +345,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
} }
// send join request // send join request
try { try {
membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout); membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
} catch (Exception e) { } catch (Exception e) {
if (e instanceof ElasticsearchException) { if (e instanceof ElasticsearchException) {
logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage()); logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
@ -713,7 +715,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
// validate the join request, will throw a failure if it fails, which will get back to the // validate the join request, will throw a failure if it fails, which will get back to the
// node calling the join request // node calling the join request
membership.sendValidateJoinRequestBlocking(node, state, pingTimeout); membership.sendValidateJoinRequestBlocking(node, state, joinTimeout);
clusterService.submitStateUpdateTask("zen-disco-receive(join from node[" + node + "])", Priority.IMMEDIATE, new ClusterStateUpdateTask() { clusterService.submitStateUpdateTask("zen-disco-receive(join from node[" + node + "])", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
@Override @Override