Extend duration of fixLag() (#34364)
Today, fixLag() waits for a new cluster state to be committed. However, it does not account for the fact that a term bump may occur, requiring a new election to take place after the cluster state is committed. This change fixes this.
This commit is contained in:
parent
a32e303b0c
commit
d98199df14
|
@ -63,7 +63,6 @@ import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -103,7 +102,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
private Releasable prevotingRound;
|
private Releasable prevotingRound;
|
||||||
@Nullable
|
@Nullable
|
||||||
private Releasable leaderCheckScheduler;
|
private Releasable leaderCheckScheduler;
|
||||||
private AtomicLong maxTermSeen = new AtomicLong();
|
private long maxTermSeen;
|
||||||
|
|
||||||
private Mode mode;
|
private Mode mode;
|
||||||
private Optional<DiscoveryNode> lastKnownLeader;
|
private Optional<DiscoveryNode> lastKnownLeader;
|
||||||
|
@ -259,13 +258,19 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateMaxTermSeen(final long term) {
|
private void updateMaxTermSeen(final long term) {
|
||||||
final long updatedMaxTermSeen = maxTermSeen.updateAndGet(oldMaxTerm -> Math.max(oldMaxTerm, term));
|
|
||||||
synchronized (mutex) {
|
synchronized (mutex) {
|
||||||
if (mode == Mode.LEADER && publicationInProgress() == false && updatedMaxTermSeen > getCurrentTerm()) {
|
maxTermSeen = Math.max(maxTermSeen, term);
|
||||||
|
final long currentTerm = getCurrentTerm();
|
||||||
|
if (mode == Mode.LEADER && maxTermSeen > currentTerm) {
|
||||||
// Bump our term. However if there is a publication in flight then doing so would cancel the publication, so don't do that
|
// Bump our term. However if there is a publication in flight then doing so would cancel the publication, so don't do that
|
||||||
// since we check whether a term bump is needed at the end of the publication too.
|
// since we check whether a term bump is needed at the end of the publication too.
|
||||||
ensureTermAtLeast(getLocalNode(), updatedMaxTermSeen);
|
if (publicationInProgress()) {
|
||||||
startElection();
|
logger.debug("updateMaxTermSeen: maxTermSeen = {} > currentTerm = {}, enqueueing term bump",
|
||||||
|
maxTermSeen, currentTerm);
|
||||||
|
} else {
|
||||||
|
ensureTermAtLeast(getLocalNode(), maxTermSeen);
|
||||||
|
startElection();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -276,7 +281,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
|
||||||
// to check our mode again here.
|
// to check our mode again here.
|
||||||
if (mode == Mode.CANDIDATE) {
|
if (mode == Mode.CANDIDATE) {
|
||||||
final StartJoinRequest startJoinRequest
|
final StartJoinRequest startJoinRequest
|
||||||
= new StartJoinRequest(getLocalNode(), Math.max(getCurrentTerm(), maxTermSeen.get()) + 1);
|
= new StartJoinRequest(getLocalNode(), Math.max(getCurrentTerm(), maxTermSeen) + 1);
|
||||||
logger.debug("starting election with {}", startJoinRequest);
|
logger.debug("starting election with {}", startJoinRequest);
|
||||||
getDiscoveredNodes().forEach(node -> joinHelper.sendStartJoinRequest(startJoinRequest, node));
|
getDiscoveredNodes().forEach(node -> joinHelper.sendStartJoinRequest(startJoinRequest, node));
|
||||||
}
|
}
|
||||||
|
|
|
@ -705,7 +705,10 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
leader.submitValue(randomLong());
|
leader.submitValue(randomLong());
|
||||||
}
|
}
|
||||||
}).run();
|
}).run();
|
||||||
runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY, "re-stabilising after lag-fixing publication");
|
runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY
|
||||||
|
// may need to bump terms too
|
||||||
|
+ DEFAULT_ELECTION_DELAY,
|
||||||
|
"re-stabilising after lag-fixing publication");
|
||||||
} else {
|
} else {
|
||||||
logger.info("--> fixLag found no lag, leader={}, leaderVersion={}, minVersion={}", leader, leaderVersion, minVersion);
|
logger.info("--> fixLag found no lag, leader={}, leaderVersion={}, minVersion={}", leader, leaderVersion, minVersion);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue