Only allow the master to update the list of nodes in the cluster state (#21092)
The cluster state on a node is updated either (1) by incoming cluster states that are received from the active master, or (2) by the node itself when it notices that the master has gone. In the second case, the node adds the NO_MASTER_BLOCK and removes the current master as active master from its cluster state. In one particular case, it would also update the list of nodes, removing the master node that just failed. In the future, we want a clear separation between actions that can be executed by a master publishing a cluster state and actions that a node performs when locally updating its cluster state while no active master is around. As a step in that direction, this commit stops the node from removing the failed master from its list of nodes, so that only the master updates that list.
This commit is contained in:
parent
e6dda02c66
commit
e82a1f5cca
|
@ -683,15 +683,10 @@ public class ZenDiscovery extends AbstractLifecycleComponent implements Discover
|
||||||
return currentState;
|
return currentState;
|
||||||
}
|
}
|
||||||
|
|
||||||
DiscoveryNodes discoveryNodes = DiscoveryNodes.builder(currentState.nodes())
|
|
||||||
// make sure the old master node, which has failed, is not part of the nodes we publish
|
|
||||||
.remove(masterNode)
|
|
||||||
.masterNodeId(null).build();
|
|
||||||
|
|
||||||
// flush any pending cluster states from old master, so it will not be set as master again
|
// flush any pending cluster states from old master, so it will not be set as master again
|
||||||
publishClusterState.pendingStatesQueue().failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason));
|
publishClusterState.pendingStatesQueue().failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason));
|
||||||
|
|
||||||
return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")");
|
return rejoin(currentState, "master left (reason = " + reason + ")");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -132,7 +132,9 @@ public class MinimumMasterNodesIT extends ESIntegTestCase {
|
||||||
});
|
});
|
||||||
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
|
state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
|
||||||
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
|
assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
|
||||||
assertThat(state.nodes().getSize(), equalTo(1)); // verify that we still see the local node in the cluster state
|
// verify that both nodes are still in the cluster state but there is no master
|
||||||
|
assertThat(state.nodes().getSize(), equalTo(2));
|
||||||
|
assertThat(state.nodes().getMasterNode(), equalTo(null));
|
||||||
|
|
||||||
logger.info("--> starting the previous master node again...");
|
logger.info("--> starting the previous master node again...");
|
||||||
internalCluster().startNode(settings);
|
internalCluster().startNode(settings);
|
||||||
|
|
|
@ -993,7 +993,11 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
|
||||||
|
|
||||||
String isolatedNode = randomBoolean() ? masterNode : nonMasterNode;
|
String isolatedNode = randomBoolean() ? masterNode : nonMasterNode;
|
||||||
TwoPartitions partitions = isolateNode(isolatedNode);
|
TwoPartitions partitions = isolateNode(isolatedNode);
|
||||||
NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
|
// we cannot use the NetworkUnresponsive disruption type here as it will swallow the "shard failed" request, calling neither
|
||||||
|
// onSuccess nor onFailure on the provided listener.
|
||||||
|
NetworkLinkDisruptionType disruptionType = new NetworkDisconnect();
|
||||||
|
NetworkDisruption networkDisruption = new NetworkDisruption(partitions, disruptionType);
|
||||||
|
setDisruptionScheme(networkDisruption);
|
||||||
networkDisruption.startDisrupting();
|
networkDisruption.startDisrupting();
|
||||||
|
|
||||||
service.localShardFailed(failedShard, "simulated", new CorruptIndexException("simulated", (String) null), new
|
service.localShardFailed(failedShard, "simulated", new CorruptIndexException("simulated", (String) null), new
|
||||||
|
|
Loading…
Reference in New Issue