[Discovery] Handle ConnectTransportException during a Master/Node fault detection ping
Both the Master and Node fault detection register themselves to be notified when a node disconnects, so that they can respond to it accordingly. As such, when a ConnectTransportException was raised on a ping request, it was deliberately ignored, on the assumption that it is already handled by the registered connection listener. However, this introduces a race condition: if the disconnect happens during a period where there is no current master (minimum_master_node breach), the fault detection is not active at that time. In this case, we will only discover the disconnect error during the ping request, so we have to respond to it there. Closes #6686
This commit is contained in:
parent
3b959706b3
commit
8909a77724
|
@ -296,14 +296,13 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
if (!running) {
|
if (!running) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (exp instanceof ConnectTransportException) {
|
|
||||||
// ignore this one, we already handle it by registering a connection listener
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
synchronized (masterNodeMutex) {
|
synchronized (masterNodeMutex) {
|
||||||
// check if the master node did not get switched on us...
|
// check if the master node did not get switched on us...
|
||||||
if (masterToPing.equals(MasterFaultDetection.this.masterNode())) {
|
if (masterToPing.equals(MasterFaultDetection.this.masterNode())) {
|
||||||
if (exp.getCause() instanceof NoLongerMasterException) {
|
if (exp instanceof ConnectTransportException) {
|
||||||
|
handleTransportDisconnect(masterToPing);
|
||||||
|
return;
|
||||||
|
} else if (exp.getCause() instanceof NoLongerMasterException) {
|
||||||
logger.debug("[master] pinging a master {} that is no longer a master", masterNode);
|
logger.debug("[master] pinging a master {} that is no longer a master", masterNode);
|
||||||
notifyMasterFailure(masterToPing, "no longer master");
|
notifyMasterFailure(masterToPing, "no longer master");
|
||||||
return;
|
return;
|
||||||
|
@ -316,6 +315,7 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
notifyMasterFailure(masterToPing, "do not exists on master, act as master failure");
|
notifyMasterFailure(masterToPing, "do not exists on master, act as master failure");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
int retryCount = ++MasterFaultDetection.this.retryCount;
|
int retryCount = ++MasterFaultDetection.this.retryCount;
|
||||||
logger.trace("[master] failed to ping [{}], retry [{}] out of [{}]", exp, masterNode, retryCount, pingRetryCount);
|
logger.trace("[master] failed to ping [{}], retry [{}] out of [{}]", exp, masterNode, retryCount, pingRetryCount);
|
||||||
if (retryCount >= pingRetryCount) {
|
if (retryCount >= pingRetryCount) {
|
||||||
|
@ -334,7 +334,8 @@ public class MasterFaultDetection extends AbstractComponent {
|
||||||
public String executor() {
|
public String executor() {
|
||||||
return ThreadPool.Names.SAME;
|
return ThreadPool.Names.SAME;
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -228,15 +228,16 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
if (!running) {
|
if (!running) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (exp instanceof ConnectTransportException) {
|
|
||||||
// ignore this one, we already handle it by registering a connection listener
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
NodeFD nodeFD = nodesFD.get(node);
|
NodeFD nodeFD = nodesFD.get(node);
|
||||||
if (nodeFD != null) {
|
if (nodeFD != null) {
|
||||||
if (!nodeFD.running) {
|
if (!nodeFD.running) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (exp instanceof ConnectTransportException) {
|
||||||
|
handleTransportDisconnect(node);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
int retryCount = ++nodeFD.retryCount;
|
int retryCount = ++nodeFD.retryCount;
|
||||||
logger.trace("[node ] failed to ping [{}], retry [{}] out of [{}]", exp, node, retryCount, pingRetryCount);
|
logger.trace("[node ] failed to ping [{}], retry [{}] out of [{}]", exp, node, retryCount, pingRetryCount);
|
||||||
if (retryCount >= pingRetryCount) {
|
if (retryCount >= pingRetryCount) {
|
||||||
|
@ -257,7 +258,8 @@ public class NodesFaultDetection extends AbstractComponent {
|
||||||
public String executor() {
|
public String executor() {
|
||||||
return ThreadPool.Names.SAME;
|
return ThreadPool.Names.SAME;
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue