From 8909a77724d239cce805cc1bf84c13850d7a78a3 Mon Sep 17 00:00:00 2001 From: Boaz Leskes Date: Wed, 2 Jul 2014 16:27:35 +0200 Subject: [PATCH] [Discovery] Handle ConnectTransportException during a Master/Node fault detection ping Both the Master and Node fault detection register themselves to be notified when a node disconnects, so that they can respond to it accordingly. As such, when a ConnectTransportException was raised on a ping request, it was ignored, as the disconnect is already handled by the registered connection listener. However, this does introduce a race condition if the disconnect happens during a period where there is no current master (minimum_master_node breach), at which time the fault detection is not active. In this case, we will only discover the disconnect error during the ping request, so we have to respond accordingly. Closes #6686 --- .../discovery/zen/fd/MasterFaultDetection.java | 13 +++++++------ .../discovery/zen/fd/NodesFaultDetection.java | 12 +++++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java index 6e6124c6d32..a438d3bc736 100644 --- a/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java +++ b/src/main/java/org/elasticsearch/discovery/zen/fd/MasterFaultDetection.java @@ -296,14 +296,13 @@ public class MasterFaultDetection extends AbstractComponent { if (!running) { return; } - if (exp instanceof ConnectTransportException) { - // ignore this one, we already handle it by registering a connection listener - return; - } synchronized (masterNodeMutex) { // check if the master node did not get switched on us... 
if (masterToPing.equals(MasterFaultDetection.this.masterNode())) { - if (exp.getCause() instanceof NoLongerMasterException) { + if (exp instanceof ConnectTransportException) { + handleTransportDisconnect(masterToPing); + return; + } else if (exp.getCause() instanceof NoLongerMasterException) { logger.debug("[master] pinging a master {} that is no longer a master", masterNode); notifyMasterFailure(masterToPing, "no longer master"); return; @@ -316,6 +315,7 @@ public class MasterFaultDetection extends AbstractComponent { notifyMasterFailure(masterToPing, "do not exists on master, act as master failure"); return; } + int retryCount = ++MasterFaultDetection.this.retryCount; logger.trace("[master] failed to ping [{}], retry [{}] out of [{}]", exp, masterNode, retryCount, pingRetryCount); if (retryCount >= pingRetryCount) { @@ -334,7 +334,8 @@ public class MasterFaultDetection extends AbstractComponent { public String executor() { return ThreadPool.Names.SAME; } - }); + } + ); } } diff --git a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java index 4271a000e58..ed300343745 100644 --- a/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java +++ b/src/main/java/org/elasticsearch/discovery/zen/fd/NodesFaultDetection.java @@ -228,15 +228,16 @@ public class NodesFaultDetection extends AbstractComponent { if (!running) { return; } - if (exp instanceof ConnectTransportException) { - // ignore this one, we already handle it by registering a connection listener - return; - } NodeFD nodeFD = nodesFD.get(node); if (nodeFD != null) { if (!nodeFD.running) { return; } + if (exp instanceof ConnectTransportException) { + handleTransportDisconnect(node); + return; + } + int retryCount = ++nodeFD.retryCount; logger.trace("[node ] failed to ping [{}], retry [{}] out of [{}]", exp, node, retryCount, pingRetryCount); if (retryCount >= pingRetryCount) { @@ -257,7 +258,8 @@ 
public class NodesFaultDetection extends AbstractComponent { public String executor() { return ThreadPool.Names.SAME; } - }); + } + ); } }