Only allow the master to update the list of nodes in the cluster state (#21092)

The cluster state on a node is updated in one of two ways: (a) by incoming cluster states that are received from the active master, or (b) by the node itself when it notices that the master has gone. In the second case, the node adds the NO_MASTER_BLOCK and removes the current master as active master from its cluster state. In one particular case, it would also update the list of nodes, removing the master node that just failed. In the future, we want a clear separation between the actions that can be executed by a master publishing a cluster state and those of a node locally updating its cluster state when no active master is around. This commit therefore removes that special case: a node that loses its master no longer removes the failed master from its local list of nodes.
2016-10-26 09:24:03 +02:00 · 2016-10-26 09:24:03 +02:00 · e82a1f5cca
parent e6dda02c66
commit e82a1f5cca
3 changed files with 9 additions and 8 deletions
--- a/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
+++ b/core/src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
@@ -683,15 +683,10 @@ public class ZenDiscovery extends AbstractLifecycleComponent implements Discover
                    return currentState;
                }
-                DiscoveryNodes discoveryNodes = DiscoveryNodes.builder(currentState.nodes())
-                        // make sure the old master node, which has failed, is not part of the nodes we publish
-                        .remove(masterNode)
-                        .masterNodeId(null).build();
                // flush any pending cluster states from old master, so it will not be set as master again
                publishClusterState.pendingStatesQueue().failAllStatesAndClear(new ElasticsearchException("master left [{}]", reason));
-                return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")");
+                return rejoin(currentState, "master left (reason = " + reason + ")");
            }
            @Override
--- a/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java
+++ b/core/src/test/java/org/elasticsearch/cluster/MinimumMasterNodesIT.java
@@ -132,7 +132,9 @@ public class MinimumMasterNodesIT extends ESIntegTestCase {
        });
        state = client().admin().cluster().prepareState().setLocal(true).execute().actionGet().getState();
        assertThat(state.blocks().hasGlobalBlock(DiscoverySettings.NO_MASTER_BLOCK_ID), equalTo(true));
-        assertThat(state.nodes().getSize(), equalTo(1)); // verify that we still see the local node in the cluster state
+        // verify that both nodes are still in the cluster state but there is no master
+        assertThat(state.nodes().getSize(), equalTo(2));
+        assertThat(state.nodes().getMasterNode(), equalTo(null));
        logger.info("--> starting the previous master node again...");
        internalCluster().startNode(settings);
--- a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java
+++ b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java
@@ -993,7 +993,11 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
        String isolatedNode = randomBoolean() ? masterNode : nonMasterNode;
        TwoPartitions partitions = isolateNode(isolatedNode);
-        NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
+        // we cannot use the NetworkUnresponsive disruption type here as it will swallow the "shard failed" request, calling neither
+        // onSuccess nor onFailure on the provided listener.
+        NetworkLinkDisruptionType disruptionType = new NetworkDisconnect();
+        NetworkDisruption networkDisruption = new NetworkDisruption(partitions, disruptionType);
        setDisruptionScheme(networkDisruption);
        networkDisruption.startDisrupting();
        service.localShardFailed(failedShard, "simulated", new CorruptIndexException("simulated", (String) null), new