Fault detection: default to attempting a reconnect after a network disconnection

This commit is contained in:
kimchy 2010-12-23 16:51:09 +02:00
parent e313379ed5
commit de7dd3c070
2 changed files with 26 additions and 2 deletions

View File

@@ -92,7 +92,7 @@ public class MasterFaultDetection extends AbstractComponent {
this.transportService = transportService; this.transportService = transportService;
this.nodesProvider = nodesProvider; this.nodesProvider = nodesProvider;
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false); this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", true);
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1)); this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30)); this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3); this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
@@ -196,6 +196,12 @@ public class MasterFaultDetection extends AbstractComponent {
if (connectOnNetworkDisconnect) { if (connectOnNetworkDisconnect) {
try { try {
transportService.connectToNode(node); transportService.connectToNode(node);
// if all is well, make sure we restart the pinger
if (masterPinger != null) {
masterPinger.stop();
}
this.masterPinger = new MasterPinger();
threadPool.schedule(masterPinger, pingInterval);
} catch (Exception e) { } catch (Exception e) {
logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode); logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode);
notifyMasterFailure(masterNode, "transport disconnected (with verified connect)"); notifyMasterFailure(masterNode, "transport disconnected (with verified connect)");
@@ -285,6 +291,10 @@ public class MasterFaultDetection extends AbstractComponent {
if (!running) { if (!running) {
return; return;
} }
if (exp instanceof ConnectTransportException) {
// ignore this one, we already handle it by registering a connection listener
return;
}
synchronized (masterNodeMutex) { synchronized (masterNodeMutex) {
// check if the master node did not get switched on us... // check if the master node did not get switched on us...
if (masterToPing.equals(MasterFaultDetection.this.masterNode())) { if (masterToPing.equals(MasterFaultDetection.this.masterNode())) {

View File

@@ -84,7 +84,7 @@ public class NodesFaultDetection extends AbstractComponent {
this.threadPool = threadPool; this.threadPool = threadPool;
this.transportService = transportService; this.transportService = transportService;
this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", false); this.connectOnNetworkDisconnect = componentSettings.getAsBoolean("connect_on_network_disconnect", true);
this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1)); this.pingInterval = componentSettings.getAsTime("ping_interval", timeValueSeconds(1));
this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30)); this.pingRetryTimeout = componentSettings.getAsTime("ping_timeout", timeValueSeconds(30));
this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3); this.pingRetryCount = componentSettings.getAsInt("ping_retries", 3);
@@ -163,9 +163,12 @@ public class NodesFaultDetection extends AbstractComponent {
if (!running) { if (!running) {
return; return;
} }
nodeFD.running = false;
if (connectOnNetworkDisconnect) { if (connectOnNetworkDisconnect) {
try { try {
transportService.connectToNode(node); transportService.connectToNode(node);
nodesFD.put(node, new NodeFD());
threadPool.schedule(new SendPingRequest(node), pingInterval);
} catch (Exception e) { } catch (Exception e) {
logger.trace("[node ] [{}] transport disconnected (with verified connect)", node); logger.trace("[node ] [{}] transport disconnected (with verified connect)", node);
notifyNodeFailure(node, "transport disconnected (with verified connect)"); notifyNodeFailure(node, "transport disconnected (with verified connect)");
@@ -210,6 +213,9 @@ public class NodesFaultDetection extends AbstractComponent {
} }
NodeFD nodeFD = nodesFD.get(node); NodeFD nodeFD = nodesFD.get(node);
if (nodeFD != null) { if (nodeFD != null) {
if (!nodeFD.running) {
return;
}
nodeFD.retryCount = 0; nodeFD.retryCount = 0;
threadPool.schedule(SendPingRequest.this, pingInterval); threadPool.schedule(SendPingRequest.this, pingInterval);
} }
@@ -220,8 +226,15 @@ public class NodesFaultDetection extends AbstractComponent {
if (!running) { if (!running) {
return; return;
} }
if (exp instanceof ConnectTransportException) {
// ignore this one, we already handle it by registering a connection listener
return;
}
NodeFD nodeFD = nodesFD.get(node); NodeFD nodeFD = nodesFD.get(node);
if (nodeFD != null) { if (nodeFD != null) {
if (!nodeFD.running) {
return;
}
int retryCount = ++nodeFD.retryCount; int retryCount = ++nodeFD.retryCount;
logger.trace("[node ] failed to ping [{}], retry [{}] out of [{}]", exp, node, retryCount, pingRetryCount); logger.trace("[node ] failed to ping [{}], retry [{}] out of [{}]", exp, node, retryCount, pingRetryCount);
if (retryCount >= pingRetryCount) { if (retryCount >= pingRetryCount) {
@@ -247,6 +260,7 @@ public class NodesFaultDetection extends AbstractComponent {
static class NodeFD { static class NodeFD {
volatile int retryCount; volatile int retryCount;
volatile boolean running = true;
} }
private class FDConnectionListener implements TransportConnectionListener { private class FDConnectionListener implements TransportConnectionListener {