Account for election duration (#34362)

Today we may schedule two elections very close together, which can cause the
first election to fail even if there are no other nodes. This change adds a
delay in between subsequent elections on the same node, effectively allowing
time for each election to complete before scheduling the next one.
This commit is contained in:
David Turner 2018-10-11 15:31:08 +01:00 committed by GitHub
parent 52a3a19551
commit a32e303b0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 16 deletions

View File

@ -48,6 +48,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout";
private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time";
private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout";
private static final String ELECTION_DURATION_SETTING_KEY = "cluster.election.duration";
/*
* The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of
@ -59,6 +60,8 @@ public class ElectionSchedulerFactory extends AbstractComponent {
* number of milliseconds is chosen uniformly from
*
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
*
* Each election lasts up to ELECTION_DURATION_SETTING.
*/
public static final Setting<TimeValue> ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
@ -70,9 +73,13 @@ public class ElectionSchedulerFactory extends AbstractComponent {
public static final Setting<TimeValue> ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_MAX_TIMEOUT_SETTING_KEY,
TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(200), TimeValue.timeValueSeconds(300), Property.NodeScope);
public static final Setting<TimeValue> ELECTION_DURATION_SETTING = Setting.timeSetting(ELECTION_DURATION_SETTING_KEY,
TimeValue.timeValueMillis(500), TimeValue.timeValueMillis(1), TimeValue.timeValueSeconds(300), Property.NodeScope);
private final TimeValue initialTimeout;
private final TimeValue backoffTime;
private final TimeValue maxTimeout;
private final TimeValue duration;
private final ThreadPool threadPool;
private final Random random;
@ -85,6 +92,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings);
backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings);
maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings);
duration = ELECTION_DURATION_SETTING.get(settings);
if (maxTimeout.millis() < initialTimeout.millis()) {
throw new IllegalArgumentException(new ParameterizedMessage("[{}] is [{}], but must be at least [{}] which is [{}]",
@ -154,15 +162,11 @@ public class ElectionSchedulerFactory extends AbstractComponent {
protected void doRun() {
if (isClosed.get()) {
logger.debug("{} not starting election", this);
return;
}
} else {
logger.debug("{} starting election", this);
scheduleNextElection(duration, scheduledRunnable);
scheduledRunnable.run();
}
@Override
public void onAfter() {
scheduleNextElection(TimeValue.ZERO, scheduledRunnable);
}
@Override

View File

@ -450,6 +450,7 @@ public final class ClusterSettings extends AbstractScopedSettings {
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
Coordinator.PUBLISH_TIMEOUT_SETTING,
JoinHelper.JOIN_TIMEOUT_SETTING
)));

View File

@ -74,6 +74,7 @@ import static org.elasticsearch.cluster.coordination.Coordinator.Mode.LEADER;
import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
@ -490,6 +491,7 @@ public class CoordinatorTests extends ESTestCase {
// Then wait for an election to be scheduled; we allow enough time for retries to allow for collisions
+ defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) * ELECTION_RETRIES
+ defaultMillis(ELECTION_BACK_OFF_TIME_SETTING) * ELECTION_RETRIES * (ELECTION_RETRIES - 1) / 2
+ defaultMillis(ELECTION_DURATION_SETTING) * ELECTION_RETRIES
// Allow two round-trip for pre-voting and voting
+ 4 * DEFAULT_DELAY_VARIABILITY
// Then a commit of the new leader's first cluster state

View File

@ -28,6 +28,7 @@ import org.elasticsearch.test.ESTestCase;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.toPositiveLongAtMost;
@ -48,7 +49,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
private void assertElectionSchedule(final DeterministicTaskQueue deterministicTaskQueue,
final ElectionSchedulerFactory electionSchedulerFactory,
final long initialTimeout, final long backOffTime, final long maxTimeout) {
final long initialTimeout, final long backOffTime, final long maxTimeout, final long duration) {
final TimeValue initialGracePeriod = randomGracePeriod();
final AtomicBoolean electionStarted = new AtomicBoolean();
@ -56,7 +57,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
try (Releasable ignored = electionSchedulerFactory.startElectionScheduler(initialGracePeriod,
() -> assertTrue(electionStarted.compareAndSet(false, true)))) {
long lastElectionTime = deterministicTaskQueue.getCurrentTimeMillis();
long lastElectionFinishTime = deterministicTaskQueue.getCurrentTimeMillis();
int electionCount = 0;
while (true) {
@ -70,10 +71,10 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
}
assertTrue(electionStarted.compareAndSet(true, false));
final long thisElectionTime = deterministicTaskQueue.getCurrentTimeMillis();
final long thisElectionStartTime = deterministicTaskQueue.getCurrentTimeMillis();
if (electionCount == 1) {
final long electionDelay = thisElectionTime - lastElectionTime;
final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
// Check grace period
assertThat(electionDelay, greaterThanOrEqualTo(initialGracePeriod.millis()));
@ -84,7 +85,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
} else {
final long electionDelay = thisElectionTime - lastElectionTime;
final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
// Check upper bound
assertThat(electionDelay, lessThanOrEqualTo(initialTimeout + backOffTime * (electionCount - 1)));
@ -98,7 +99,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
}
}
lastElectionTime = thisElectionTime;
lastElectionFinishTime = thisElectionStartTime + duration;
}
}
deterministicTaskQueue.runAllTasks();
@ -125,19 +126,28 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
randomLongBetween(Math.max(200, initialTimeoutMillis), 180000) + "ms");
}
final long electionDurationMillis;
if (randomBoolean()) {
electionDurationMillis = randomLongBetween(1, 300000);
settingsBuilder.put(ELECTION_DURATION_SETTING.getKey(), electionDurationMillis + "ms");
} else {
electionDurationMillis = ELECTION_DURATION_SETTING.get(Settings.EMPTY).millis();
}
final Settings settings = settingsBuilder.put(NODE_NAME_SETTING.getKey(), "node").build();
final long initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings).millis();
final long backOffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings).millis();
final long maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings).millis();
final long duration = ELECTION_DURATION_SETTING.get(settings).millis();
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
final ElectionSchedulerFactory electionSchedulerFactory
= new ElectionSchedulerFactory(settings, random(), deterministicTaskQueue.getThreadPool());
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout);
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
// do it again to show that the max is reset when the scheduler is restarted
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout);
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
}
public void testSettingsValidation() {