Allow election of nodes outside voting config (#43243)

Today we suppress election attempts on master-eligible nodes that are not in
the voting configuration. In fact this restriction is not necessary: any
master-eligible node can safely become master as long as it has a fresh enough
cluster state and can gather a quorum of votes. Moreover, this restriction is
sometimes undesirable: there may be a reason why we do not want any of the
nodes in the voting configuration to become master.

The reason for this restriction is as follows. If you want to shut the master
down then you might first exclude it from the voting configuration. When this
exclusion succeeds you might reasonably expect that a new master has been
elected, since the voting config exclusion is almost always a step towards
shutting the node down. If we allow nodes outside the voting configuration to
be the master then the excluded node will continue to be master, which is
confusing.

This commit adjusts the logic to allow master-eligible nodes to attempt an
election even if they are not in the voting configuration. If such a master is
successfully elected then it adds itself to the voting configuration. This
commit also adjusts the logic that causes master nodes to abdicate when they
are excluded from the voting configuration, to avoid the confusion described
above.

Relates #37712, #37802.
This commit is contained in:
David Turner 2019-06-18 12:10:01 +01:00
parent 5a9c48369b
commit 2e064e0d13
4 changed files with 98 additions and 21 deletions

View File

@ -380,9 +380,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
// The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
// to check our mode again here.
if (mode == Mode.CANDIDATE) {
if (electionQuorumContainsLocalNode(getLastAcceptedState()) == false) {
logger.trace("skip election as local node is not part of election quorum: {}",
getLastAcceptedState().coordinationMetaData());
if (localNodeMayWinElection(getLastAcceptedState()) == false) {
logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetaData());
return;
}
@ -415,16 +414,17 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
becomeCandidate("after abdicating to " + newMaster);
}
private static boolean electionQuorumContainsLocalNode(ClusterState lastAcceptedState) {
private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) {
final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode();
assert localNode != null;
return electionQuorumContains(lastAcceptedState, localNode);
return nodeMayWinElection(lastAcceptedState, localNode);
}
private static boolean electionQuorumContains(ClusterState lastAcceptedState, DiscoveryNode node) {
private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) {
final String nodeId = node.getId();
return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId)
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId);
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId)
|| lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId));
}
private Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
@ -867,8 +867,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
metaDataBuilder.coordinationMetaData(coordinationMetaData);
coordinationState.get().setInitialState(ClusterState.builder(currentState).metaData(metaDataBuilder).build());
assert electionQuorumContainsLocalNode(getLastAcceptedState()) :
"initial state does not have local node in its election quorum: " + getLastAcceptedState().coordinationMetaData();
assert localNodeMayWinElection(getLastAcceptedState()) :
"initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetaData();
preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version
startElectionScheduler();
return true;
@ -1164,8 +1164,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
if (mode == Mode.CANDIDATE) {
final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState();
if (electionQuorumContainsLocalNode(lastAcceptedState) == false) {
logger.trace("skip prevoting as local node is not part of election quorum: {}",
if (localNodeMayWinElection(lastAcceptedState) == false) {
logger.trace("skip prevoting as local node may not win election: {}",
lastAcceptedState.coordinationMetaData());
return;
}
@ -1329,16 +1329,20 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
updateMaxTermSeen(getCurrentTerm());
if (mode == Mode.LEADER) {
// if necessary, abdicate to another node or improve the voting configuration
boolean attemptReconfiguration = true;
final ClusterState state = getLastAcceptedState(); // committed state
if (electionQuorumContainsLocalNode(state) == false) {
if (localNodeMayWinElection(state) == false) {
final List<DiscoveryNode> masterCandidates = completedNodes().stream()
.filter(DiscoveryNode::isMasterNode)
.filter(node -> electionQuorumContains(state, node))
.filter(node -> nodeMayWinElection(state, node))
.collect(Collectors.toList());
if (masterCandidates.isEmpty() == false) {
abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size())));
attemptReconfiguration = false;
}
} else {
}
if (attemptReconfiguration) {
scheduleReconfigurationIfNeeded();
}
}

View File

@ -150,6 +150,11 @@ public class Reconfigurator {
@Override
public int compareTo(VotingConfigNode other) {
// prefer current master
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
if (currentMasterComp != 0) {
return currentMasterComp;
}
// prefer nodes that are live
final int liveComp = Boolean.compare(other.live, live);
if (liveComp != 0) {
@ -160,11 +165,6 @@ public class Reconfigurator {
if (inCurrentConfigComp != 0) {
return inCurrentConfigComp;
}
// prefer current master
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
if (currentMasterComp != 0) {
return currentMasterComp;
}
// tiebreak by node id to have stable ordering
return id.compareTo(other.id);
}

View File

@ -53,7 +53,7 @@ public class ReconfiguratorTests extends ESTestCase {
check(nodes("a"), conf("a"), true, conf("a"));
check(nodes("a", "b"), conf("a"), true, conf("a"));
check(nodes("a", "b"), conf("b"), true, conf("b"));
check(nodes("a", "b"), conf("b"), true, conf("a"));
check(nodes("a", "b"), conf("a", "c"), true, conf("a"));
check(nodes("a", "b"), conf("a", "b"), true, conf("a"));
check(nodes("a", "b"), conf("a", "b", "e"), true, conf("a", "b", "e"));

View File

@ -18,17 +18,38 @@
*/
package org.elasticsearch.cluster.coordination;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Priority;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.TransportService;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.nullValue;
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
public class VotingConfigurationIT extends ESIntegTestCase {
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singletonList(MockTransportService.TestPlugin.class);
}
public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionException, InterruptedException {
internalCluster().setBootstrapMasterNodeIndex(0);
internalCluster().startNodes(2);
final String originalMaster = internalCluster().getMasterName();
@ -38,4 +59,56 @@ public class VotingConfigurationIT extends ESIntegTestCase {
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
assertNotEquals(originalMaster, internalCluster().getMasterName());
}
public void testElectsNodeNotInVotingConfiguration() throws Exception {
internalCluster().setBootstrapMasterNodeIndex(0);
final List<String> nodeNames = internalCluster().startNodes(4);
// a 4-node cluster settles on a 3-node configuration; we then prevent the nodes in the configuration from winning an election
// by failing at the pre-voting stage, so that the extra node must be elected instead when the master shuts down. This extra node
// should then add itself into the voting configuration.
assertFalse(internalCluster().client().admin().cluster().prepareHealth()
.setWaitForNodes("4").setWaitForEvents(Priority.LANGUID).get().isTimedOut());
String excludedNodeName = null;
final ClusterState clusterState
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
final Set<String> votingConfiguration = clusterState.getLastCommittedConfiguration().getNodeIds();
assertThat(votingConfiguration, hasSize(3));
assertThat(clusterState.nodes().getSize(), equalTo(4));
assertThat(votingConfiguration, hasItem(clusterState.nodes().getMasterNodeId()));
for (DiscoveryNode discoveryNode : clusterState.nodes()) {
if (votingConfiguration.contains(discoveryNode.getId()) == false) {
assertThat(excludedNodeName, nullValue());
excludedNodeName = discoveryNode.getName();
}
}
for (final String sender : nodeNames) {
if (sender.equals(excludedNodeName)) {
continue;
}
final MockTransportService senderTransportService
= (MockTransportService) internalCluster().getInstance(TransportService.class, sender);
for (final String receiver : nodeNames) {
senderTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, receiver),
(connection, requestId, action, request, options) -> {
if (action.equals(PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)) {
throw new ElasticsearchException("rejected");
}
connection.sendRequest(requestId, action, request, options);
});
}
}
internalCluster().stopCurrentMasterNode();
assertFalse(internalCluster().client().admin().cluster().prepareHealth()
.setWaitForNodes("3").setWaitForEvents(Priority.LANGUID).get().isTimedOut());
final ClusterState newClusterState
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
assertThat(newClusterState.nodes().getMasterNode().getName(), equalTo(excludedNodeName));
assertThat(newClusterState.getLastCommittedConfiguration().getNodeIds(), hasItem(newClusterState.nodes().getMasterNodeId()));
}
}