mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-14 00:45:30 +00:00
Account for election duration (#34362)
Today we may schedule two elections very close together, which can cause the first election to fail even if there are no other nodes. This change adds a delay in between subsequent elections on the same node, effectively allowing time for each election to complete before scheduling the next one.
This commit is contained in:
parent
52a3a19551
commit
a32e303b0c
@ -48,6 +48,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
||||
private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout";
|
||||
private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time";
|
||||
private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout";
|
||||
private static final String ELECTION_DURATION_SETTING_KEY = "cluster.election.duration";
|
||||
|
||||
/*
|
||||
* The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of
|
||||
@ -59,6 +60,8 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
||||
* number of milliseconds is chosen uniformly from
|
||||
*
|
||||
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
|
||||
*
|
||||
* Each election lasts up to ELECTION_DURATION_SETTING.
|
||||
*/
|
||||
|
||||
public static final Setting<TimeValue> ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
|
||||
@ -70,9 +73,13 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
||||
public static final Setting<TimeValue> ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_MAX_TIMEOUT_SETTING_KEY,
|
||||
TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(200), TimeValue.timeValueSeconds(300), Property.NodeScope);
|
||||
|
||||
public static final Setting<TimeValue> ELECTION_DURATION_SETTING = Setting.timeSetting(ELECTION_DURATION_SETTING_KEY,
|
||||
TimeValue.timeValueMillis(500), TimeValue.timeValueMillis(1), TimeValue.timeValueSeconds(300), Property.NodeScope);
|
||||
|
||||
private final TimeValue initialTimeout;
|
||||
private final TimeValue backoffTime;
|
||||
private final TimeValue maxTimeout;
|
||||
private final TimeValue duration;
|
||||
private final ThreadPool threadPool;
|
||||
private final Random random;
|
||||
|
||||
@ -85,6 +92,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
||||
initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings);
|
||||
backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings);
|
||||
maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings);
|
||||
duration = ELECTION_DURATION_SETTING.get(settings);
|
||||
|
||||
if (maxTimeout.millis() < initialTimeout.millis()) {
|
||||
throw new IllegalArgumentException(new ParameterizedMessage("[{}] is [{}], but must be at least [{}] which is [{}]",
|
||||
@ -154,15 +162,11 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
||||
protected void doRun() {
|
||||
if (isClosed.get()) {
|
||||
logger.debug("{} not starting election", this);
|
||||
return;
|
||||
} else {
|
||||
logger.debug("{} starting election", this);
|
||||
scheduleNextElection(duration, scheduledRunnable);
|
||||
scheduledRunnable.run();
|
||||
}
|
||||
logger.debug("{} starting election", this);
|
||||
scheduledRunnable.run();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onAfter() {
|
||||
scheduleNextElection(TimeValue.ZERO, scheduledRunnable);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -450,6 +450,7 @@ public final class ClusterSettings extends AbstractScopedSettings {
|
||||
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
|
||||
ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
|
||||
ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
|
||||
ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
|
||||
Coordinator.PUBLISH_TIMEOUT_SETTING,
|
||||
JoinHelper.JOIN_TIMEOUT_SETTING
|
||||
)));
|
||||
|
@ -74,6 +74,7 @@ import static org.elasticsearch.cluster.coordination.Coordinator.Mode.LEADER;
|
||||
import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
|
||||
@ -490,6 +491,7 @@ public class CoordinatorTests extends ESTestCase {
|
||||
// Then wait for an election to be scheduled; we allow enough time for retries to allow for collisions
|
||||
+ defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) * ELECTION_RETRIES
|
||||
+ defaultMillis(ELECTION_BACK_OFF_TIME_SETTING) * ELECTION_RETRIES * (ELECTION_RETRIES - 1) / 2
|
||||
+ defaultMillis(ELECTION_DURATION_SETTING) * ELECTION_RETRIES
|
||||
// Allow two round-trip for pre-voting and voting
|
||||
+ 4 * DEFAULT_DELAY_VARIABILITY
|
||||
// Then a commit of the new leader's first cluster state
|
||||
|
@ -28,6 +28,7 @@ import org.elasticsearch.test.ESTestCase;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING;
|
||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.toPositiveLongAtMost;
|
||||
@ -48,7 +49,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
||||
|
||||
private void assertElectionSchedule(final DeterministicTaskQueue deterministicTaskQueue,
|
||||
final ElectionSchedulerFactory electionSchedulerFactory,
|
||||
final long initialTimeout, final long backOffTime, final long maxTimeout) {
|
||||
final long initialTimeout, final long backOffTime, final long maxTimeout, final long duration) {
|
||||
|
||||
final TimeValue initialGracePeriod = randomGracePeriod();
|
||||
final AtomicBoolean electionStarted = new AtomicBoolean();
|
||||
@ -56,7 +57,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
||||
try (Releasable ignored = electionSchedulerFactory.startElectionScheduler(initialGracePeriod,
|
||||
() -> assertTrue(electionStarted.compareAndSet(false, true)))) {
|
||||
|
||||
long lastElectionTime = deterministicTaskQueue.getCurrentTimeMillis();
|
||||
long lastElectionFinishTime = deterministicTaskQueue.getCurrentTimeMillis();
|
||||
int electionCount = 0;
|
||||
|
||||
while (true) {
|
||||
@ -70,10 +71,10 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
||||
}
|
||||
assertTrue(electionStarted.compareAndSet(true, false));
|
||||
|
||||
final long thisElectionTime = deterministicTaskQueue.getCurrentTimeMillis();
|
||||
final long thisElectionStartTime = deterministicTaskQueue.getCurrentTimeMillis();
|
||||
|
||||
if (electionCount == 1) {
|
||||
final long electionDelay = thisElectionTime - lastElectionTime;
|
||||
final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
|
||||
|
||||
// Check grace period
|
||||
assertThat(electionDelay, greaterThanOrEqualTo(initialGracePeriod.millis()));
|
||||
@ -84,7 +85,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
||||
|
||||
} else {
|
||||
|
||||
final long electionDelay = thisElectionTime - lastElectionTime;
|
||||
final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
|
||||
|
||||
// Check upper bound
|
||||
assertThat(electionDelay, lessThanOrEqualTo(initialTimeout + backOffTime * (electionCount - 1)));
|
||||
@ -98,7 +99,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
||||
}
|
||||
}
|
||||
|
||||
lastElectionTime = thisElectionTime;
|
||||
lastElectionFinishTime = thisElectionStartTime + duration;
|
||||
}
|
||||
}
|
||||
deterministicTaskQueue.runAllTasks();
|
||||
@ -125,19 +126,28 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
||||
randomLongBetween(Math.max(200, initialTimeoutMillis), 180000) + "ms");
|
||||
}
|
||||
|
||||
final long electionDurationMillis;
|
||||
if (randomBoolean()) {
|
||||
electionDurationMillis = randomLongBetween(1, 300000);
|
||||
settingsBuilder.put(ELECTION_DURATION_SETTING.getKey(), electionDurationMillis + "ms");
|
||||
} else {
|
||||
electionDurationMillis = ELECTION_DURATION_SETTING.get(Settings.EMPTY).millis();
|
||||
}
|
||||
|
||||
final Settings settings = settingsBuilder.put(NODE_NAME_SETTING.getKey(), "node").build();
|
||||
final long initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings).millis();
|
||||
final long backOffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings).millis();
|
||||
final long maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings).millis();
|
||||
final long duration = ELECTION_DURATION_SETTING.get(settings).millis();
|
||||
|
||||
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
||||
final ElectionSchedulerFactory electionSchedulerFactory
|
||||
= new ElectionSchedulerFactory(settings, random(), deterministicTaskQueue.getThreadPool());
|
||||
|
||||
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout);
|
||||
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
|
||||
|
||||
// do it again to show that the max is reset when the scheduler is restarted
|
||||
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout);
|
||||
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
|
||||
}
|
||||
|
||||
public void testSettingsValidation() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user