Fix node failure detection race when updating cluster state

Failure detection should only be updated in ZenDiscovery after the current state has been updated to prevent a race condition
with handleLeaveRequest and handleNodeFailure as those check the current state to determine whether the failure is to be handled by this node.
This commit is contained in:
Yannick Welsch 2017-04-28 19:52:27 +02:00
parent 55daf743d7
commit 9c55bca8fb
1 changed files with 13 additions and 10 deletions

View File

@ -775,16 +775,6 @@ public class ZenDiscovery extends AbstractLifecycleComponent implements Discover
return false; return false;
} }
if (newClusterState.nodes().isLocalNodeElectedMaster()) {
// update the set of nodes to ping
nodesFD.updateNodesAndPing(newClusterState);
} else {
// check to see that we monitor the correct master of the cluster
if (masterFD.masterNode() == null || !masterFD.masterNode().equals(newClusterState.nodes().getMasterNode())) {
masterFD.restart(newClusterState.nodes().getMasterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
}
}
if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) { if (currentState.blocks().hasGlobalBlock(discoverySettings.getNoMasterBlock())) {
// its a fresh update from the master as we transition from a start of not having a master to having one // its a fresh update from the master as we transition from a start of not having a master to having one
logger.debug("got first state from fresh master [{}]", newClusterState.nodes().getMasterNodeId()); logger.debug("got first state from fresh master [{}]", newClusterState.nodes().getMasterNodeId());
@ -827,6 +817,19 @@ public class ZenDiscovery extends AbstractLifecycleComponent implements Discover
state.set(adaptedNewClusterState); state.set(adaptedNewClusterState);
// update failure detection only after the state has been updated to prevent race condition with handleLeaveRequest
// and handleNodeFailure as those check the current state to determine whether the failure is to be handled by this node
if (adaptedNewClusterState.nodes().isLocalNodeElectedMaster()) {
// update the set of nodes to ping
nodesFD.updateNodesAndPing(adaptedNewClusterState);
} else {
// check to see that we monitor the correct master of the cluster
if (masterFD.masterNode() == null || !masterFD.masterNode().equals(adaptedNewClusterState.nodes().getMasterNode())) {
masterFD.restart(adaptedNewClusterState.nodes().getMasterNode(),
"new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]");
}
}
clusterApplier.onNewClusterState("apply cluster state (from master [" + reason + "])", clusterApplier.onNewClusterState("apply cluster state (from master [" + reason + "])",
this::clusterState, this::clusterState,
new ClusterStateTaskListener() { new ClusterStateTaskListener() {