Gather votes from all nodes (#34335)
Today we accept that some nodes may vote for the wrong master in an election. This is mostly fine because they do end up joining the correct master in the end, but the lack of a vote from every follower may prevent a future desirable reconfiguration from taking place. The solution is to hold another election in a yet-higher term in order to collect a complete set of votes. Elections are somewhat disruptive so we should think carefully about when this election should take place. One option is to wait as late as possible (on the grounds that it might not ever be necessary). This unfortunately makes it harder to predict how an apparently-smoothly-running cluster will react to nodes leaving and joining. Instead we prefer to perform the election as soon as possible in the leader's term, adding "votes from all followers" to the invariants that we expect to hold in a stable cluster. The start of a leader's term is already a somewhat disrupted time for the cluster, so performing another election at this point does not materially change the cluster's behaviour. This change implements the logic needed to trigger a new election in order to satisfy this extra stabilisation condition.
This commit is contained in:
parent
29d7d1d503
commit
03da4f6c51
|
@ -222,8 +222,10 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
ensureTermAtLeast(sourceNode, publishRequest.getAcceptedState().term());
|
ensureTermAtLeast(sourceNode, publishRequest.getAcceptedState().term());
|
||||||
final PublishResponse publishResponse = coordinationState.get().handlePublishRequest(publishRequest);
|
final PublishResponse publishResponse = coordinationState.get().handlePublishRequest(publishRequest);
|
||||||
|
|
||||||
if (sourceNode.equals(getLocalNode()) == false) {
|
if (sourceNode.equals(getLocalNode())) {
|
||||||
becomeFollower("handlePublishRequest", sourceNode);
|
preVoteCollector.update(getPreVoteResponse(), getLocalNode());
|
||||||
|
} else {
|
||||||
|
becomeFollower("handlePublishRequest", sourceNode); // also updates preVoteCollector
|
||||||
}
|
}
|
||||||
|
|
||||||
return new PublishWithJoinResponse(publishResponse,
|
return new PublishWithJoinResponse(publishResponse,
|
||||||
|
@ -254,27 +256,31 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateMaxTermSeen(final long term) {
|
private void updateMaxTermSeen(final long term) {
|
||||||
maxTermSeen.updateAndGet(oldMaxTerm -> Math.max(oldMaxTerm, term));
|
final long updatedMaxTermSeen = maxTermSeen.updateAndGet(oldMaxTerm -> Math.max(oldMaxTerm, term));
|
||||||
// TODO if we are leader here, and there is no publication in flight, then we should bump our term
|
synchronized (mutex) {
|
||||||
// (if we are leader and there _is_ a publication in flight then doing so would cancel the publication, so don't do that, but
|
if (mode == Mode.LEADER && publicationInProgress() == false && updatedMaxTermSeen > getCurrentTerm()) {
|
||||||
// do check for this after the publication completes)
|
// Bump our term. However if there is a publication in flight then doing so would cancel the publication, so don't do that
|
||||||
|
// since we check whether a term bump is needed at the end of the publication too.
|
||||||
|
ensureTermAtLeast(getLocalNode(), updatedMaxTermSeen);
|
||||||
|
startElection();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: make private again after removing term-bump workaround
|
private void startElection() {
|
||||||
void startElection() {
|
|
||||||
synchronized (mutex) {
|
synchronized (mutex) {
|
||||||
// The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
|
// The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
|
||||||
// to check our mode again here.
|
// to check our mode again here.
|
||||||
if (mode == Mode.CANDIDATE) {
|
if (mode == Mode.CANDIDATE) {
|
||||||
final StartJoinRequest startJoinRequest
|
final StartJoinRequest startJoinRequest
|
||||||
= new StartJoinRequest(getLocalNode(), Math.max(getCurrentTerm(), maxTermSeen.get()) + 1);
|
= new StartJoinRequest(getLocalNode(), Math.max(getCurrentTerm(), maxTermSeen.get()) + 1);
|
||||||
|
logger.debug("starting election with {}", startJoinRequest);
|
||||||
getDiscoveredNodes().forEach(node -> joinHelper.sendStartJoinRequest(startJoinRequest, node));
|
getDiscoveredNodes().forEach(node -> joinHelper.sendStartJoinRequest(startJoinRequest, node));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: make private again after removing term-bump workaround
|
private Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
|
||||||
Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
|
|
||||||
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
||||||
if (getCurrentTerm() < targetTerm) {
|
if (getCurrentTerm() < targetTerm) {
|
||||||
return Optional.of(joinLeaderInTerm(new StartJoinRequest(sourceNode, targetTerm)));
|
return Optional.of(joinLeaderInTerm(new StartJoinRequest(sourceNode, targetTerm)));
|
||||||
|
@ -289,9 +295,10 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
lastJoin = Optional.of(join);
|
lastJoin = Optional.of(join);
|
||||||
peerFinder.setCurrentTerm(getCurrentTerm());
|
peerFinder.setCurrentTerm(getCurrentTerm());
|
||||||
if (mode != Mode.CANDIDATE) {
|
if (mode != Mode.CANDIDATE) {
|
||||||
becomeCandidate("joinLeaderInTerm"); // updates followersChecker
|
becomeCandidate("joinLeaderInTerm"); // updates followersChecker and preVoteCollector
|
||||||
} else {
|
} else {
|
||||||
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
|
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
|
||||||
|
preVoteCollector.update(getPreVoteResponse(), null);
|
||||||
}
|
}
|
||||||
return join;
|
return join;
|
||||||
}
|
}
|
||||||
|
@ -485,6 +492,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
assert becomingMaster || getStateForMasterService().nodes().getMasterNodeId() != null : getStateForMasterService();
|
assert becomingMaster || getStateForMasterService().nodes().getMasterNodeId() != null : getStateForMasterService();
|
||||||
assert leaderCheckScheduler == null : leaderCheckScheduler;
|
assert leaderCheckScheduler == null : leaderCheckScheduler;
|
||||||
assert applierState.nodes().getMasterNodeId() == null || getLocalNode().equals(applierState.nodes().getMasterNode());
|
assert applierState.nodes().getMasterNodeId() == null || getLocalNode().equals(applierState.nodes().getMasterNode());
|
||||||
|
assert preVoteCollector.getLeader() == getLocalNode() : preVoteCollector;
|
||||||
|
assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse()) : preVoteCollector;
|
||||||
|
|
||||||
final boolean activePublication = currentPublication.map(CoordinatorPublication::isActiveForCurrentLeader).orElse(false);
|
final boolean activePublication = currentPublication.map(CoordinatorPublication::isActiveForCurrentLeader).orElse(false);
|
||||||
if (becomingMaster && activePublication == false) {
|
if (becomingMaster && activePublication == false) {
|
||||||
|
@ -517,6 +526,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
assert leaderCheckScheduler != null;
|
assert leaderCheckScheduler != null;
|
||||||
assert followersChecker.getKnownFollowers().isEmpty();
|
assert followersChecker.getKnownFollowers().isEmpty();
|
||||||
assert currentPublication.map(Publication::isCommitted).orElse(true);
|
assert currentPublication.map(Publication::isCommitted).orElse(true);
|
||||||
|
assert preVoteCollector.getLeader().equals(lastKnownLeader.get()) : preVoteCollector;
|
||||||
|
assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse()) : preVoteCollector;
|
||||||
} else {
|
} else {
|
||||||
assert mode == Mode.CANDIDATE;
|
assert mode == Mode.CANDIDATE;
|
||||||
assert joinAccumulator instanceof JoinHelper.CandidateJoinAccumulator;
|
assert joinAccumulator instanceof JoinHelper.CandidateJoinAccumulator;
|
||||||
|
@ -528,6 +539,8 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
assert followersChecker.getKnownFollowers().isEmpty();
|
assert followersChecker.getKnownFollowers().isEmpty();
|
||||||
assert applierState.nodes().getMasterNodeId() == null;
|
assert applierState.nodes().getMasterNodeId() == null;
|
||||||
assert currentPublication.map(Publication::isCommitted).orElse(true);
|
assert currentPublication.map(Publication::isCommitted).orElse(true);
|
||||||
|
assert preVoteCollector.getLeader() == null : preVoteCollector;
|
||||||
|
assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse()) : preVoteCollector;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -537,7 +550,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
return coordinationState.get().containsJoinVoteFor(localNode);
|
return coordinationState.get().containsJoinVoteFor(localNode);
|
||||||
}
|
}
|
||||||
|
|
||||||
void handleJoin(Join join) {
|
private void handleJoin(Join join) {
|
||||||
synchronized (mutex) {
|
synchronized (mutex) {
|
||||||
ensureTermAtLeast(getLocalNode(), join.getTerm()).ifPresent(this::handleJoin);
|
ensureTermAtLeast(getLocalNode(), join.getTerm()).ifPresent(this::handleJoin);
|
||||||
|
|
||||||
|
@ -547,7 +560,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
try {
|
try {
|
||||||
coordinationState.get().handleJoin(join);
|
coordinationState.get().handleJoin(join);
|
||||||
} catch (CoordinationStateRejectedException e) {
|
} catch (CoordinationStateRejectedException e) {
|
||||||
logger.debug("failed to add join, ignoring", e);
|
logger.debug(new ParameterizedMessage("failed to add {} - ignoring", join), e);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
coordinationState.get().handleJoin(join); // this might fail and bubble up the exception
|
coordinationState.get().handleJoin(join); // this might fail and bubble up the exception
|
||||||
|
@ -753,6 +766,11 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
private final AckListener ackListener;
|
private final AckListener ackListener;
|
||||||
private final ActionListener<Void> publishListener;
|
private final ActionListener<Void> publishListener;
|
||||||
|
|
||||||
|
// We may not have accepted our own state before receiving a join from another node, causing its join to be rejected (we cannot
|
||||||
|
// safely accept a join whose last-accepted term/version is ahead of ours), so store them up and process them at the end.
|
||||||
|
private final List<Join> receivedJoins = new ArrayList<>();
|
||||||
|
private boolean receivedJoinsProcessed;
|
||||||
|
|
||||||
CoordinatorPublication(PublishRequest publishRequest, ListenableFuture<Void> localNodeAckEvent, AckListener ackListener,
|
CoordinatorPublication(PublishRequest publishRequest, ListenableFuture<Void> localNodeAckEvent, AckListener ackListener,
|
||||||
ActionListener<Void> publishListener) {
|
ActionListener<Void> publishListener) {
|
||||||
super(Coordinator.this.settings, publishRequest,
|
super(Coordinator.this.settings, publishRequest,
|
||||||
|
@ -790,6 +808,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
|
|
||||||
assert currentPublication.get() == this;
|
assert currentPublication.get() == this;
|
||||||
currentPublication = Optional.empty();
|
currentPublication = Optional.empty();
|
||||||
|
logger.debug("publication ended unsuccessfully: {}", this);
|
||||||
|
|
||||||
// check if node has not already switched modes (by bumping term)
|
// check if node has not already switched modes (by bumping term)
|
||||||
if (isActiveForCurrentLeader()) {
|
if (isActiveForCurrentLeader()) {
|
||||||
|
@ -812,6 +831,10 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
||||||
assert committed;
|
assert committed;
|
||||||
|
|
||||||
|
receivedJoins.forEach(CoordinatorPublication.this::handleAssociatedJoin);
|
||||||
|
assert receivedJoinsProcessed == false;
|
||||||
|
receivedJoinsProcessed = true;
|
||||||
|
|
||||||
clusterApplier.onNewClusterState(CoordinatorPublication.this.toString(), () -> applierState,
|
clusterApplier.onNewClusterState(CoordinatorPublication.this.toString(), () -> applierState,
|
||||||
new ClusterApplyListener() {
|
new ClusterApplyListener() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -828,6 +851,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
synchronized (mutex) {
|
synchronized (mutex) {
|
||||||
assert currentPublication.get() == CoordinatorPublication.this;
|
assert currentPublication.get() == CoordinatorPublication.this;
|
||||||
currentPublication = Optional.empty();
|
currentPublication = Optional.empty();
|
||||||
|
logger.debug("publication ended successfully: {}", CoordinatorPublication.this);
|
||||||
// trigger term bump if new term was found during publication
|
// trigger term bump if new term was found during publication
|
||||||
updateMaxTermSeen(getCurrentTerm());
|
updateMaxTermSeen(getCurrentTerm());
|
||||||
}
|
}
|
||||||
|
@ -850,6 +874,13 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
}, EsExecutors.newDirectExecutorService());
|
}, EsExecutors.newDirectExecutorService());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void handleAssociatedJoin(Join join) {
|
||||||
|
if (join.getTerm() == getCurrentTerm() && hasJoinVoteFrom(join.getSourceNode()) == false) {
|
||||||
|
logger.trace("handling {}", join);
|
||||||
|
handleJoin(join);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected boolean isPublishQuorum(CoordinationState.VoteCollection votes) {
|
protected boolean isPublishQuorum(CoordinationState.VoteCollection votes) {
|
||||||
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
||||||
|
@ -867,10 +898,26 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
@Override
|
@Override
|
||||||
protected void onJoin(Join join) {
|
protected void onJoin(Join join) {
|
||||||
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
||||||
if (join.getTerm() == getCurrentTerm()) {
|
if (receivedJoinsProcessed) {
|
||||||
handleJoin(join);
|
// a late response may arrive after the state has been locally applied, meaning that receivedJoins has already been
|
||||||
|
// processed, so we have to handle this late response here.
|
||||||
|
handleAssociatedJoin(join);
|
||||||
|
} else {
|
||||||
|
receivedJoins.add(join);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void onMissingJoin(DiscoveryNode discoveryNode) {
|
||||||
|
assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
|
||||||
|
// The remote node did not include a join vote in its publish response. We do not persist joins, so it could be that the remote
|
||||||
|
// node voted for us and then rebooted, or it could be that it voted for a different node in this term. If we don't have a copy
|
||||||
|
// of a join from this node then we assume the latter and bump our term to obtain a vote from this node.
|
||||||
|
if (hasJoinVoteFrom(discoveryNode) == false) {
|
||||||
|
final long term = publishRequest.getAcceptedState().term();
|
||||||
|
logger.debug("onMissingJoin: no join vote from {}, bumping term to exceed {}", discoveryNode, term);
|
||||||
|
updateMaxTermSeen(term + 1);
|
||||||
}
|
}
|
||||||
// TODO: what to do on missing join?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -79,19 +79,33 @@ public class PreVoteCollector extends AbstractComponent {
|
||||||
return preVotingRound;
|
return preVotingRound;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// only for testing
|
||||||
|
PreVoteResponse getPreVoteResponse() {
|
||||||
|
return state.v2();
|
||||||
|
}
|
||||||
|
|
||||||
|
// only for testing
|
||||||
|
@Nullable
|
||||||
|
DiscoveryNode getLeader() {
|
||||||
|
return state.v1();
|
||||||
|
}
|
||||||
|
|
||||||
public void update(final PreVoteResponse preVoteResponse, @Nullable final DiscoveryNode leader) {
|
public void update(final PreVoteResponse preVoteResponse, @Nullable final DiscoveryNode leader) {
|
||||||
logger.trace("updating with preVoteResponse={}, leader={}", preVoteResponse, leader);
|
logger.trace("updating with preVoteResponse={}, leader={}", preVoteResponse, leader);
|
||||||
state = new Tuple<>(leader, preVoteResponse);
|
state = new Tuple<>(leader, preVoteResponse);
|
||||||
}
|
}
|
||||||
|
|
||||||
private PreVoteResponse handlePreVoteRequest(final PreVoteRequest request) {
|
private PreVoteResponse handlePreVoteRequest(final PreVoteRequest request) {
|
||||||
// TODO if we are a leader and the max term seen exceeds our term then we need to bump our term
|
updateMaxTermSeen.accept(request.getCurrentTerm());
|
||||||
|
|
||||||
Tuple<DiscoveryNode, PreVoteResponse> state = this.state;
|
Tuple<DiscoveryNode, PreVoteResponse> state = this.state;
|
||||||
|
assert state != null : "received pre-vote request before fully initialised";
|
||||||
|
|
||||||
final DiscoveryNode leader = state.v1();
|
final DiscoveryNode leader = state.v1();
|
||||||
|
final PreVoteResponse response = state.v2();
|
||||||
|
|
||||||
if (leader == null) {
|
if (leader == null) {
|
||||||
return state.v2();
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (leader.equals(request.getSourceNode())) {
|
if (leader.equals(request.getSourceNode())) {
|
||||||
|
@ -100,7 +114,7 @@ public class PreVoteCollector extends AbstractComponent {
|
||||||
// major drawback in offering a join to our old leader. The advantage of this is that it makes it slightly more likely that the
|
// major drawback in offering a join to our old leader. The advantage of this is that it makes it slightly more likely that the
|
||||||
// leader won't change, and also that its re-election will happen more quickly than if it had to wait for a quorum of followers
|
// leader won't change, and also that its re-election will happen more quickly than if it had to wait for a quorum of followers
|
||||||
// to also detect its failure.
|
// to also detect its failure.
|
||||||
return state.v2();
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
throw new CoordinationStateRejectedException("rejecting " + request + " as there is already a leader");
|
throw new CoordinationStateRejectedException("rejecting " + request + " as there is already a leader");
|
||||||
|
@ -141,11 +155,7 @@ public class PreVoteCollector extends AbstractComponent {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void handleException(TransportException exp) {
|
public void handleException(TransportException exp) {
|
||||||
if (exp.getRootCause() instanceof CoordinationStateRejectedException) {
|
logger.debug(new ParameterizedMessage("{} failed", this), exp);
|
||||||
logger.debug("{} failed: {}", this, exp.getRootCause().getMessage());
|
|
||||||
} else {
|
|
||||||
logger.debug(new ParameterizedMessage("{} failed", this), exp);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -172,6 +172,8 @@ public abstract class Publication extends AbstractComponent {
|
||||||
|
|
||||||
protected abstract void onJoin(Join join);
|
protected abstract void onJoin(Join join);
|
||||||
|
|
||||||
|
protected abstract void onMissingJoin(DiscoveryNode discoveryNode);
|
||||||
|
|
||||||
protected abstract void sendPublishRequest(DiscoveryNode destination, PublishRequest publishRequest,
|
protected abstract void sendPublishRequest(DiscoveryNode destination, PublishRequest publishRequest,
|
||||||
ActionListener<PublishWithJoinResponse> responseActionListener);
|
ActionListener<PublishWithJoinResponse> responseActionListener);
|
||||||
|
|
||||||
|
@ -301,10 +303,16 @@ public abstract class Publication extends AbstractComponent {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
response.getJoin().ifPresent(join -> {
|
if (response.getJoin().isPresent()) {
|
||||||
|
final Join join = response.getJoin().get();
|
||||||
assert discoveryNode.equals(join.getSourceNode());
|
assert discoveryNode.equals(join.getSourceNode());
|
||||||
|
assert join.getTerm() == response.getPublishResponse().getTerm() : response;
|
||||||
|
logger.trace("handling join within publish response: {}", join);
|
||||||
onJoin(join);
|
onJoin(join);
|
||||||
});
|
} else {
|
||||||
|
logger.trace("publish response from {} contained no join", discoveryNode);
|
||||||
|
onMissingJoin(discoveryNode);
|
||||||
|
}
|
||||||
|
|
||||||
assert state == PublicationTargetState.SENT_PUBLISH_REQUEST : state + " -> " + PublicationTargetState.WAITING_FOR_QUORUM;
|
assert state == PublicationTargetState.SENT_PUBLISH_REQUEST : state + " -> " + PublicationTargetState.WAITING_FOR_QUORUM;
|
||||||
state = PublicationTargetState.WAITING_FOR_QUORUM;
|
state = PublicationTargetState.WAITING_FOR_QUORUM;
|
||||||
|
|
|
@ -30,8 +30,8 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
public class PublishResponse implements Writeable {
|
public class PublishResponse implements Writeable {
|
||||||
|
|
||||||
protected final long term;
|
private final long term;
|
||||||
protected final long version;
|
private final long version;
|
||||||
|
|
||||||
public PublishResponse(long term, long version) {
|
public PublishResponse(long term, long version) {
|
||||||
assert term >= 0;
|
assert term >= 0;
|
||||||
|
|
|
@ -25,10 +25,12 @@ import org.elasticsearch.cluster.ClusterState;
|
||||||
import org.elasticsearch.cluster.ClusterState.VotingConfiguration;
|
import org.elasticsearch.cluster.ClusterState.VotingConfiguration;
|
||||||
import org.elasticsearch.cluster.metadata.MetaData;
|
import org.elasticsearch.cluster.metadata.MetaData;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
|
import org.elasticsearch.cluster.node.DiscoveryNode.Role;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||||
import org.elasticsearch.common.UUIDs;
|
import org.elasticsearch.common.UUIDs;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.cluster.coordination.CoordinationState.PersistedState;
|
import org.elasticsearch.cluster.coordination.CoordinationState.PersistedState;
|
||||||
|
import org.elasticsearch.common.transport.TransportAddress;
|
||||||
import org.elasticsearch.common.util.set.Sets;
|
import org.elasticsearch.common.util.set.Sets;
|
||||||
import org.elasticsearch.node.Node;
|
import org.elasticsearch.node.Node;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
@ -37,6 +39,7 @@ import org.junit.Before;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.EnumSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -82,7 +85,11 @@ public class CoordinationStateTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static DiscoveryNode createNode(String id) {
|
public static DiscoveryNode createNode(String id) {
|
||||||
return new DiscoveryNode(id, buildNewFakeTransportAddress(), Version.CURRENT);
|
final TransportAddress address = buildNewFakeTransportAddress();
|
||||||
|
return new DiscoveryNode("", id,
|
||||||
|
UUIDs.randomBase64UUID(random()), // generated deterministically for repeatable tests
|
||||||
|
address.address().getHostString(), address.getAddress(), address, Collections.emptyMap(),
|
||||||
|
EnumSet.allOf(Role.class), Version.CURRENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSetInitialState() {
|
public void testSetInitialState() {
|
||||||
|
|
|
@ -21,7 +21,6 @@ package org.elasticsearch.cluster.coordination;
|
||||||
import org.apache.logging.log4j.CloseableThreadContext;
|
import org.apache.logging.log4j.CloseableThreadContext;
|
||||||
import org.apache.logging.log4j.message.ParameterizedMessage;
|
import org.apache.logging.log4j.message.ParameterizedMessage;
|
||||||
import org.elasticsearch.ElasticsearchException;
|
import org.elasticsearch.ElasticsearchException;
|
||||||
import org.elasticsearch.Version;
|
|
||||||
import org.elasticsearch.cluster.ClusterName;
|
import org.elasticsearch.cluster.ClusterName;
|
||||||
import org.elasticsearch.cluster.ClusterState;
|
import org.elasticsearch.cluster.ClusterState;
|
||||||
import org.elasticsearch.cluster.ClusterState.VotingConfiguration;
|
import org.elasticsearch.cluster.ClusterState.VotingConfiguration;
|
||||||
|
@ -32,10 +31,8 @@ import org.elasticsearch.cluster.coordination.CoordinationState.PersistedState;
|
||||||
import org.elasticsearch.cluster.coordination.CoordinationStateTests.InMemoryPersistedState;
|
import org.elasticsearch.cluster.coordination.CoordinationStateTests.InMemoryPersistedState;
|
||||||
import org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.ClusterNode;
|
import org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.ClusterNode;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||||
import org.elasticsearch.cluster.node.DiscoveryNode.Role;
|
|
||||||
import org.elasticsearch.cluster.service.ClusterApplier;
|
import org.elasticsearch.cluster.service.ClusterApplier;
|
||||||
import org.elasticsearch.common.Randomness;
|
import org.elasticsearch.common.Randomness;
|
||||||
import org.elasticsearch.common.UUIDs;
|
|
||||||
import org.elasticsearch.common.component.AbstractComponent;
|
import org.elasticsearch.common.component.AbstractComponent;
|
||||||
import org.elasticsearch.common.settings.ClusterSettings;
|
import org.elasticsearch.common.settings.ClusterSettings;
|
||||||
import org.elasticsearch.common.settings.Setting;
|
import org.elasticsearch.common.settings.Setting;
|
||||||
|
@ -54,8 +51,6 @@ import org.junit.Before;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.EnumSet;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -75,6 +70,7 @@ import static org.elasticsearch.cluster.coordination.CoordinationStateTests.valu
|
||||||
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.CANDIDATE;
|
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.CANDIDATE;
|
||||||
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.FOLLOWER;
|
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.FOLLOWER;
|
||||||
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.LEADER;
|
import static org.elasticsearch.cluster.coordination.Coordinator.Mode.LEADER;
|
||||||
|
import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
|
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
||||||
|
@ -187,15 +183,26 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
logger.info("--> blackholing leader {}", originalLeader);
|
logger.info("--> blackholing leader {}", originalLeader);
|
||||||
originalLeader.blackhole();
|
originalLeader.blackhole();
|
||||||
|
|
||||||
|
// This stabilisation time bound is undesirably long. TODO try and reduce it.
|
||||||
cluster.stabilise(Math.max(
|
cluster.stabilise(Math.max(
|
||||||
// first wait for all the followers to notice the leader has gone
|
// first wait for all the followers to notice the leader has gone
|
||||||
(defaultMillis(LEADER_CHECK_INTERVAL_SETTING) + defaultMillis(LEADER_CHECK_TIMEOUT_SETTING))
|
(defaultMillis(LEADER_CHECK_INTERVAL_SETTING) + defaultMillis(LEADER_CHECK_TIMEOUT_SETTING))
|
||||||
* defaultInt(LEADER_CHECK_RETRY_COUNT_SETTING)
|
* defaultInt(LEADER_CHECK_RETRY_COUNT_SETTING)
|
||||||
// then wait for a follower to be promoted to leader
|
// then wait for a follower to be promoted to leader
|
||||||
+ DEFAULT_ELECTION_DELAY
|
+ DEFAULT_ELECTION_DELAY
|
||||||
// then wait for the new leader to notice that the old leader is unresponsive
|
// and the first publication times out because of the unresponsive node
|
||||||
+ (defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING))
|
+ defaultMillis(PUBLISH_TIMEOUT_SETTING)
|
||||||
* defaultInt(FOLLOWER_CHECK_RETRY_COUNT_SETTING)
|
// there might be a term bump causing another election
|
||||||
|
+ DEFAULT_ELECTION_DELAY
|
||||||
|
|
||||||
|
// then wait for both of:
|
||||||
|
+ Math.max(
|
||||||
|
// 1. the term bumping publication to time out
|
||||||
|
defaultMillis(PUBLISH_TIMEOUT_SETTING),
|
||||||
|
// 2. the new leader to notice that the old leader is unresponsive
|
||||||
|
(defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING))
|
||||||
|
* defaultInt(FOLLOWER_CHECK_RETRY_COUNT_SETTING))
|
||||||
|
|
||||||
// then wait for the new leader to commit a state without the old leader
|
// then wait for the new leader to commit a state without the old leader
|
||||||
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY,
|
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY,
|
||||||
|
|
||||||
|
@ -205,6 +212,7 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
// then wait for the leader to try and commit a state removing them, causing it to stand down
|
// then wait for the leader to try and commit a state removing them, causing it to stand down
|
||||||
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY
|
+ DEFAULT_CLUSTER_STATE_UPDATE_DELAY
|
||||||
));
|
));
|
||||||
|
|
||||||
assertThat(cluster.getAnyLeader().getId(), not(equalTo(originalLeader.getId())));
|
assertThat(cluster.getAnyLeader().getId(), not(equalTo(originalLeader.getId())));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -599,31 +607,33 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
assertThat("stabilisation requires default delay variability (and proper cleanup of raised variability)",
|
assertThat("stabilisation requires default delay variability (and proper cleanup of raised variability)",
|
||||||
deterministicTaskQueue.getExecutionDelayVariabilityMillis(), lessThanOrEqualTo(DEFAULT_DELAY_VARIABILITY));
|
deterministicTaskQueue.getExecutionDelayVariabilityMillis(), lessThanOrEqualTo(DEFAULT_DELAY_VARIABILITY));
|
||||||
runFor(stabilisationDurationMillis, "stabilising");
|
runFor(stabilisationDurationMillis, "stabilising");
|
||||||
|
fixLag();
|
||||||
|
assertUniqueLeaderAndExpectedModes();
|
||||||
|
}
|
||||||
|
|
||||||
// TODO remove when term-bumping is enabled
|
// TODO remove this when lag detection is implemented
|
||||||
final long maxTerm = clusterNodes.stream().map(n -> n.coordinator.getCurrentTerm()).max(Long::compare).orElse(0L);
|
void fixLag() {
|
||||||
final long maxLeaderTerm = clusterNodes.stream().filter(n -> n.coordinator.getMode() == Coordinator.Mode.LEADER)
|
final ClusterNode leader = getAnyLeader();
|
||||||
.map(n -> n.coordinator.getCurrentTerm()).max(Long::compare).orElse(0L);
|
final long leaderVersion = leader.coordinator.getLastAcceptedState().version();
|
||||||
|
final long minVersion = clusterNodes.stream()
|
||||||
|
.filter(n -> isConnectedPair(n, leader))
|
||||||
|
.map(n -> n.coordinator.getLastAcceptedState().version()).min(Long::compare).orElse(Long.MIN_VALUE);
|
||||||
|
|
||||||
if (maxLeaderTerm < maxTerm) {
|
assert minVersion >= 0;
|
||||||
logger.info("--> forcing a term bump, maxTerm={}, maxLeaderTerm={}", maxTerm, maxLeaderTerm);
|
if (minVersion < leaderVersion) {
|
||||||
final ClusterNode leader = getAnyLeader();
|
logger.info("--> publishing a value to fix lag, leaderVersion={}, minVersion={}", leaderVersion, minVersion);
|
||||||
onNode(leader.getLocalNode(), () -> {
|
onNode(leader.getLocalNode(), () -> {
|
||||||
synchronized (leader.coordinator.mutex) {
|
synchronized (leader.coordinator.mutex) {
|
||||||
leader.coordinator.ensureTermAtLeast(leader.localNode, maxTerm + 1);
|
leader.submitValue(randomLong());
|
||||||
}
|
}
|
||||||
leader.coordinator.startElection();
|
|
||||||
}).run();
|
}).run();
|
||||||
runFor(DEFAULT_ELECTION_DELAY, "re-stabilising after term bump");
|
|
||||||
}
|
}
|
||||||
logger.info("--> end of stabilisation");
|
runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY, "re-stabilising after lag-fixing publication");
|
||||||
|
|
||||||
assertUniqueLeaderAndExpectedModes();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void runFor(long runDurationMillis, String description) {
|
void runFor(long runDurationMillis, String description) {
|
||||||
final long endTime = deterministicTaskQueue.getCurrentTimeMillis() + runDurationMillis;
|
final long endTime = deterministicTaskQueue.getCurrentTimeMillis() + runDurationMillis;
|
||||||
logger.info("----> runFor({}ms) running until [{}ms]: {}", runDurationMillis, endTime, description);
|
logger.info("--> runFor({}ms) running until [{}ms]: {}", runDurationMillis, endTime, description);
|
||||||
|
|
||||||
while (deterministicTaskQueue.getCurrentTimeMillis() < endTime) {
|
while (deterministicTaskQueue.getCurrentTimeMillis() < endTime) {
|
||||||
|
|
||||||
|
@ -648,7 +658,7 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
deterministicTaskQueue.advanceTime();
|
deterministicTaskQueue.advanceTime();
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("----> runFor({}ms) completed run until [{}ms]: {}", runDurationMillis, endTime, description);
|
logger.info("--> runFor({}ms) completed run until [{}ms]: {}", runDurationMillis, endTime, description);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isConnectedPair(ClusterNode n1, ClusterNode n2) {
|
private boolean isConnectedPair(ClusterNode n1, ClusterNode n2) {
|
||||||
|
@ -677,7 +687,7 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
if (isConnectedPair(leader, clusterNode)) {
|
if (isConnectedPair(leader, clusterNode)) {
|
||||||
assertThat(nodeId + " is a follower", clusterNode.coordinator.getMode(), is(FOLLOWER));
|
assertThat(nodeId + " is a follower", clusterNode.coordinator.getMode(), is(FOLLOWER));
|
||||||
assertThat(nodeId + " has the same term as the leader", clusterNode.coordinator.getCurrentTerm(), is(leaderTerm));
|
assertThat(nodeId + " has the same term as the leader", clusterNode.coordinator.getCurrentTerm(), is(leaderTerm));
|
||||||
// TODO assert that this node has actually voted for the leader in this term
|
assertTrue(nodeId + " has voted for the leader", leader.coordinator.hasJoinVoteFrom(clusterNode.getLocalNode()));
|
||||||
// TODO assert that this node's accepted and committed states are the same as the leader's
|
// TODO assert that this node's accepted and committed states are the same as the leader's
|
||||||
|
|
||||||
assertTrue(nodeId + " is in the leader's applied state",
|
assertTrue(nodeId + " is in the leader's applied state",
|
||||||
|
@ -754,13 +764,7 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private DiscoveryNode createDiscoveryNode() {
|
private DiscoveryNode createDiscoveryNode() {
|
||||||
final TransportAddress transportAddress = buildNewFakeTransportAddress();
|
return CoordinationStateTests.createNode(nodeIdFromIndex(nodeIndex));
|
||||||
// Generate the ephemeral ID deterministically, for repeatable tests. This means we have to pass everything else into the
|
|
||||||
// constructor explicitly too.
|
|
||||||
return new DiscoveryNode("", nodeIdFromIndex(nodeIndex), UUIDs.randomBase64UUID(random()),
|
|
||||||
transportAddress.address().getHostString(),
|
|
||||||
transportAddress.getAddress(), transportAddress, Collections.emptyMap(),
|
|
||||||
EnumSet.allOf(Role.class), Version.CURRENT);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setUp() {
|
private void setUp() {
|
||||||
|
|
|
@ -101,6 +101,7 @@ public class PublicationTests extends ESTestCase {
|
||||||
Map<DiscoveryNode, ActionListener<PublishWithJoinResponse>> pendingPublications = new HashMap<>();
|
Map<DiscoveryNode, ActionListener<PublishWithJoinResponse>> pendingPublications = new HashMap<>();
|
||||||
Map<DiscoveryNode, ActionListener<TransportResponse.Empty>> pendingCommits = new HashMap<>();
|
Map<DiscoveryNode, ActionListener<TransportResponse.Empty>> pendingCommits = new HashMap<>();
|
||||||
Map<DiscoveryNode, Join> joins = new HashMap<>();
|
Map<DiscoveryNode, Join> joins = new HashMap<>();
|
||||||
|
Set<DiscoveryNode> missingJoins = new HashSet<>();
|
||||||
|
|
||||||
MockPublication(Settings settings, PublishRequest publishRequest, Discovery.AckListener ackListener,
|
MockPublication(Settings settings, PublishRequest publishRequest, Discovery.AckListener ackListener,
|
||||||
LongSupplier currentTimeSupplier) {
|
LongSupplier currentTimeSupplier) {
|
||||||
|
@ -120,6 +121,11 @@ public class PublicationTests extends ESTestCase {
|
||||||
assertNull(joins.put(join.getSourceNode(), join));
|
assertNull(joins.put(join.getSourceNode(), join));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void onMissingJoin(DiscoveryNode discoveryNode) {
|
||||||
|
assertTrue(missingJoins.add(discoveryNode));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void sendPublishRequest(DiscoveryNode destination, PublishRequest publishRequest,
|
protected void sendPublishRequest(DiscoveryNode destination, PublishRequest publishRequest,
|
||||||
ActionListener<PublishWithJoinResponse> responseActionListener) {
|
ActionListener<PublishWithJoinResponse> responseActionListener) {
|
||||||
|
@ -182,14 +188,16 @@ public class PublicationTests extends ESTestCase {
|
||||||
assertNotEquals(processedNode1PublishResponse.get(), publication.pendingCommits.isEmpty());
|
assertNotEquals(processedNode1PublishResponse.get(), publication.pendingCommits.isEmpty());
|
||||||
assertFalse(publication.joins.containsKey(e.getKey()));
|
assertFalse(publication.joins.containsKey(e.getKey()));
|
||||||
PublishWithJoinResponse publishWithJoinResponse = new PublishWithJoinResponse(publishResponse,
|
PublishWithJoinResponse publishWithJoinResponse = new PublishWithJoinResponse(publishResponse,
|
||||||
randomBoolean() ? Optional.empty() : Optional.of(new Join(e.getKey(), randomFrom(n1, n2, n3), randomNonNegativeLong(),
|
randomBoolean() ? Optional.empty() : Optional.of(new Join(e.getKey(), randomFrom(n1, n2, n3), publishResponse.getTerm(),
|
||||||
randomNonNegativeLong(), randomNonNegativeLong())));
|
randomNonNegativeLong(), randomNonNegativeLong())));
|
||||||
e.getValue().onResponse(publishWithJoinResponse);
|
e.getValue().onResponse(publishWithJoinResponse);
|
||||||
if (publishWithJoinResponse.getJoin().isPresent()) {
|
if (publishWithJoinResponse.getJoin().isPresent()) {
|
||||||
assertTrue(publication.joins.containsKey(e.getKey()));
|
assertTrue(publication.joins.containsKey(e.getKey()));
|
||||||
|
assertFalse(publication.missingJoins.contains(e.getKey()));
|
||||||
assertEquals(publishWithJoinResponse.getJoin().get(), publication.joins.get(e.getKey()));
|
assertEquals(publishWithJoinResponse.getJoin().get(), publication.joins.get(e.getKey()));
|
||||||
} else {
|
} else {
|
||||||
assertFalse(publication.joins.containsKey(e.getKey()));
|
assertFalse(publication.joins.containsKey(e.getKey()));
|
||||||
|
assertTrue(publication.missingJoins.contains(e.getKey()));
|
||||||
}
|
}
|
||||||
if (e.getKey().equals(n1)) {
|
if (e.getKey().equals(n1)) {
|
||||||
processedNode1PublishResponse.set(true);
|
processedNode1PublishResponse.set(true);
|
||||||
|
|
Loading…
Reference in New Issue