Describe STALE_STATE_CONFIG in ClusterFormationFH (#53878)

We mark cluster states persisted on master-ineligible nodes as
potentially-stale using the voting configuration `{STALE_STATE_CONFIG}` which
prevents these nodes from being elected as master if they are restarted as
master-eligible. Today we do not handle this special voting configuration
differently in the `ClusterFormationFailureHelper`, leading to a mysterious
message `an election requires a node with id [STALE_STATE_CONFIG]` if the
election does not succeed.

This commit adds a special case description for this situation to explain
better why this node cannot win an election.

Closes #53734
This commit is contained in:
David Turner 2020-03-20 19:01:05 +00:00 committed by David Turner
parent 0cfe6d90cc
commit 879e26ec06
3 changed files with 25 additions and 2 deletions

View File

@ -31,6 +31,7 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.gateway.GatewayMetaState;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
@ -210,7 +211,12 @@ public class ClusterFormationFailureHelper {
assert requiredNodes <= realNodeIds.size() : nodeIds;
if (nodeIds.size() == 1) {
if (nodeIds.contains(GatewayMetaState.STALE_STATE_CONFIG_NODE_ID)) {
return "one or more nodes that have already participated as master-eligible nodes in the cluster but this node was " +
"not master-eligible the last time it joined the cluster";
} else {
return "a node with id " + realNodeIds;
}
} else if (nodeIds.size() == 2) {
return "two nodes with ids " + realNodeIds;
} else {

View File

@ -81,6 +81,13 @@ import static org.elasticsearch.common.util.concurrent.EsExecutors.daemonThreadF
*/
public class GatewayMetaState implements Closeable {
/**
 * Fake node ID for a voting configuration written by a master-ineligible data node to indicate that its on-disk state is potentially
 * stale (since it is written asynchronously after application, rather than before acceptance). This node ID means that if the node is
 * restarted as a master-eligible node then it does not win any elections until it has received a fresh cluster state.
 * <p>
 * This ID is the sole member of the stale-state voting configuration, and {@code ClusterFormationFailureHelper} checks for it so that it
 * can explain why this node cannot win an election instead of reporting it as the ID of a real node.
 */
public static final String STALE_STATE_CONFIG_NODE_ID = "STALE_STATE_CONFIG";
// Set by calling start()
private final SetOnce<PersistedState> persistedState = new SetOnce<>();
@ -425,7 +432,7 @@ public class GatewayMetaState implements Closeable {
}
static final CoordinationMetaData.VotingConfiguration staleStateConfiguration =
new CoordinationMetaData.VotingConfiguration(Collections.singleton("STALE_STATE_CONFIG"));
new CoordinationMetaData.VotingConfiguration(Collections.singleton(STALE_STATE_CONFIG_NODE_ID));
static ClusterState resetVotingConfiguration(ClusterState clusterState) {
CoordinationMetaData newCoordinationMetaData = CoordinationMetaData.builder(clusterState.coordinationMetaData())

View File

@ -29,6 +29,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.gateway.GatewayMetaState;
import org.elasticsearch.test.ESTestCase;
import java.util.Arrays;
@ -412,5 +413,14 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
"have discovered [] which is not a quorum; " +
"discovery will continue using [] from hosts providers and [" + otherMasterNode + ", " + localNode +
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0")));
assertThat(new ClusterFormationState(Settings.EMPTY, state(localNode, GatewayMetaState.STALE_STATE_CONFIG_NODE_ID), emptyList(),
emptyList(), 0L, electionStrategy).getDescription(),
is("master not discovered or elected yet, an election requires one or more nodes that have already participated as " +
"master-eligible nodes in the cluster but this node was not master-eligible the last time it joined the cluster, " +
"have discovered [] which is not a quorum; " +
"discovery will continue using [] from hosts providers and [" + localNode +
"] from last-known cluster state; node term 0, last-accepted version 0 in term 0"));
}
}