Account for election duration (#34362)

Today we may schedule two elections very close together, which can cause the
first election to fail even if there are no other nodes. This change adds a
delay between successive elections on the same node, effectively allowing
time for each election to complete before the next one starts.
This commit is contained in:
David Turner 2018-10-11 15:31:08 +01:00 committed by GitHub
parent 52a3a19551
commit a32e303b0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 16 deletions

View File

@ -48,6 +48,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout"; private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout";
private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time"; private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time";
private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout"; private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout";
private static final String ELECTION_DURATION_SETTING_KEY = "cluster.election.duration";
/* /*
* The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of * The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of
@ -59,6 +60,8 @@ public class ElectionSchedulerFactory extends AbstractComponent {
* number of milliseconds is chosen uniformly from * number of milliseconds is chosen uniformly from
* *
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)] * (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
*
* Each election lasts up to ELECTION_DURATION_SETTING.
*/ */
public static final Setting<TimeValue> ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_INITIAL_TIMEOUT_SETTING_KEY, public static final Setting<TimeValue> ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
@ -70,9 +73,13 @@ public class ElectionSchedulerFactory extends AbstractComponent {
public static final Setting<TimeValue> ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_MAX_TIMEOUT_SETTING_KEY, public static final Setting<TimeValue> ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_MAX_TIMEOUT_SETTING_KEY,
TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(200), TimeValue.timeValueSeconds(300), Property.NodeScope); TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(200), TimeValue.timeValueSeconds(300), Property.NodeScope);
public static final Setting<TimeValue> ELECTION_DURATION_SETTING = Setting.timeSetting(ELECTION_DURATION_SETTING_KEY,
TimeValue.timeValueMillis(500), TimeValue.timeValueMillis(1), TimeValue.timeValueSeconds(300), Property.NodeScope);
private final TimeValue initialTimeout; private final TimeValue initialTimeout;
private final TimeValue backoffTime; private final TimeValue backoffTime;
private final TimeValue maxTimeout; private final TimeValue maxTimeout;
private final TimeValue duration;
private final ThreadPool threadPool; private final ThreadPool threadPool;
private final Random random; private final Random random;
@ -85,6 +92,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings); initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings);
backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings); backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings);
maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings); maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings);
duration = ELECTION_DURATION_SETTING.get(settings);
if (maxTimeout.millis() < initialTimeout.millis()) { if (maxTimeout.millis() < initialTimeout.millis()) {
throw new IllegalArgumentException(new ParameterizedMessage("[{}] is [{}], but must be at least [{}] which is [{}]", throw new IllegalArgumentException(new ParameterizedMessage("[{}] is [{}], but must be at least [{}] which is [{}]",
@ -154,15 +162,11 @@ public class ElectionSchedulerFactory extends AbstractComponent {
protected void doRun() { protected void doRun() {
if (isClosed.get()) { if (isClosed.get()) {
logger.debug("{} not starting election", this); logger.debug("{} not starting election", this);
return; } else {
logger.debug("{} starting election", this);
scheduleNextElection(duration, scheduledRunnable);
scheduledRunnable.run();
} }
logger.debug("{} starting election", this);
scheduledRunnable.run();
}
@Override
public void onAfter() {
scheduleNextElection(TimeValue.ZERO, scheduledRunnable);
} }
@Override @Override

View File

@ -450,6 +450,7 @@ public final class ClusterSettings extends AbstractScopedSettings {
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING, ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING, ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING, ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
Coordinator.PUBLISH_TIMEOUT_SETTING, Coordinator.PUBLISH_TIMEOUT_SETTING,
JoinHelper.JOIN_TIMEOUT_SETTING JoinHelper.JOIN_TIMEOUT_SETTING
))); )));

View File

@ -74,6 +74,7 @@ import static org.elasticsearch.cluster.coordination.Coordinator.Mode.LEADER;
import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING; import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY; import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING; import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING; import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING; import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING; import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
@ -490,6 +491,7 @@ public class CoordinatorTests extends ESTestCase {
// Then wait for an election to be scheduled; we allow enough time for retries to allow for collisions // Then wait for an election to be scheduled; we allow enough time for retries to allow for collisions
+ defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) * ELECTION_RETRIES + defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) * ELECTION_RETRIES
+ defaultMillis(ELECTION_BACK_OFF_TIME_SETTING) * ELECTION_RETRIES * (ELECTION_RETRIES - 1) / 2 + defaultMillis(ELECTION_BACK_OFF_TIME_SETTING) * ELECTION_RETRIES * (ELECTION_RETRIES - 1) / 2
+ defaultMillis(ELECTION_DURATION_SETTING) * ELECTION_RETRIES
// Allow two round-trip for pre-voting and voting // Allow two round-trip for pre-voting and voting
+ 4 * DEFAULT_DELAY_VARIABILITY + 4 * DEFAULT_DELAY_VARIABILITY
// Then a commit of the new leader's first cluster state // Then a commit of the new leader's first cluster state

View File

@ -28,6 +28,7 @@ import org.elasticsearch.test.ESTestCase;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING; import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING; import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING; import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING;
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.toPositiveLongAtMost; import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.toPositiveLongAtMost;
@ -48,7 +49,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
private void assertElectionSchedule(final DeterministicTaskQueue deterministicTaskQueue, private void assertElectionSchedule(final DeterministicTaskQueue deterministicTaskQueue,
final ElectionSchedulerFactory electionSchedulerFactory, final ElectionSchedulerFactory electionSchedulerFactory,
final long initialTimeout, final long backOffTime, final long maxTimeout) { final long initialTimeout, final long backOffTime, final long maxTimeout, final long duration) {
final TimeValue initialGracePeriod = randomGracePeriod(); final TimeValue initialGracePeriod = randomGracePeriod();
final AtomicBoolean electionStarted = new AtomicBoolean(); final AtomicBoolean electionStarted = new AtomicBoolean();
@ -56,7 +57,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
try (Releasable ignored = electionSchedulerFactory.startElectionScheduler(initialGracePeriod, try (Releasable ignored = electionSchedulerFactory.startElectionScheduler(initialGracePeriod,
() -> assertTrue(electionStarted.compareAndSet(false, true)))) { () -> assertTrue(electionStarted.compareAndSet(false, true)))) {
long lastElectionTime = deterministicTaskQueue.getCurrentTimeMillis(); long lastElectionFinishTime = deterministicTaskQueue.getCurrentTimeMillis();
int electionCount = 0; int electionCount = 0;
while (true) { while (true) {
@ -70,10 +71,10 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
} }
assertTrue(electionStarted.compareAndSet(true, false)); assertTrue(electionStarted.compareAndSet(true, false));
final long thisElectionTime = deterministicTaskQueue.getCurrentTimeMillis(); final long thisElectionStartTime = deterministicTaskQueue.getCurrentTimeMillis();
if (electionCount == 1) { if (electionCount == 1) {
final long electionDelay = thisElectionTime - lastElectionTime; final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
// Check grace period // Check grace period
assertThat(electionDelay, greaterThanOrEqualTo(initialGracePeriod.millis())); assertThat(electionDelay, greaterThanOrEqualTo(initialGracePeriod.millis()));
@ -84,7 +85,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
} else { } else {
final long electionDelay = thisElectionTime - lastElectionTime; final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
// Check upper bound // Check upper bound
assertThat(electionDelay, lessThanOrEqualTo(initialTimeout + backOffTime * (electionCount - 1))); assertThat(electionDelay, lessThanOrEqualTo(initialTimeout + backOffTime * (electionCount - 1)));
@ -98,7 +99,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
} }
} }
lastElectionTime = thisElectionTime; lastElectionFinishTime = thisElectionStartTime + duration;
} }
} }
deterministicTaskQueue.runAllTasks(); deterministicTaskQueue.runAllTasks();
@ -125,19 +126,28 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
randomLongBetween(Math.max(200, initialTimeoutMillis), 180000) + "ms"); randomLongBetween(Math.max(200, initialTimeoutMillis), 180000) + "ms");
} }
final long electionDurationMillis;
if (randomBoolean()) {
electionDurationMillis = randomLongBetween(1, 300000);
settingsBuilder.put(ELECTION_DURATION_SETTING.getKey(), electionDurationMillis + "ms");
} else {
electionDurationMillis = ELECTION_DURATION_SETTING.get(Settings.EMPTY).millis();
}
final Settings settings = settingsBuilder.put(NODE_NAME_SETTING.getKey(), "node").build(); final Settings settings = settingsBuilder.put(NODE_NAME_SETTING.getKey(), "node").build();
final long initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings).millis(); final long initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings).millis();
final long backOffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings).millis(); final long backOffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings).millis();
final long maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings).millis(); final long maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings).millis();
final long duration = ELECTION_DURATION_SETTING.get(settings).millis();
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random()); final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
final ElectionSchedulerFactory electionSchedulerFactory final ElectionSchedulerFactory electionSchedulerFactory
= new ElectionSchedulerFactory(settings, random(), deterministicTaskQueue.getThreadPool()); = new ElectionSchedulerFactory(settings, random(), deterministicTaskQueue.getThreadPool());
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout); assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
// do it again to show that the max is reset when the scheduler is restarted // do it again to show that the max is reset when the scheduler is restarted
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout); assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
} }
public void testSettingsValidation() { public void testSettingsValidation() {