mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-15 17:35:41 +00:00
Account for election duration (#34362)
Today we may schedule two elections very close together, which can cause the first election to fail even if there are no other nodes. This change adds a delay in between subsequent elections on the same node, effectively allowing time for each election to complete before scheduling the next one.
This commit is contained in:
parent
52a3a19551
commit
a32e303b0c
@ -48,6 +48,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
|||||||
private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout";
|
private static final String ELECTION_INITIAL_TIMEOUT_SETTING_KEY = "cluster.election.initial_timeout";
|
||||||
private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time";
|
private static final String ELECTION_BACK_OFF_TIME_SETTING_KEY = "cluster.election.back_off_time";
|
||||||
private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout";
|
private static final String ELECTION_MAX_TIMEOUT_SETTING_KEY = "cluster.election.max_timeout";
|
||||||
|
private static final String ELECTION_DURATION_SETTING_KEY = "cluster.election.duration";
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of
|
* The first election is scheduled to occur a random number of milliseconds after the scheduler is started, where the random number of
|
||||||
@ -59,6 +60,8 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
|||||||
* number of milliseconds is chosen uniformly from
|
* number of milliseconds is chosen uniformly from
|
||||||
*
|
*
|
||||||
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
|
* (0, min(ELECTION_INITIAL_TIMEOUT_SETTING + (n-1) * ELECTION_BACK_OFF_TIME_SETTING, ELECTION_MAX_TIMEOUT_SETTING)]
|
||||||
|
*
|
||||||
|
* Each election lasts up to ELECTION_DURATION_SETTING.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public static final Setting<TimeValue> ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
|
public static final Setting<TimeValue> ELECTION_INITIAL_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_INITIAL_TIMEOUT_SETTING_KEY,
|
||||||
@ -70,9 +73,13 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
|||||||
public static final Setting<TimeValue> ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_MAX_TIMEOUT_SETTING_KEY,
|
public static final Setting<TimeValue> ELECTION_MAX_TIMEOUT_SETTING = Setting.timeSetting(ELECTION_MAX_TIMEOUT_SETTING_KEY,
|
||||||
TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(200), TimeValue.timeValueSeconds(300), Property.NodeScope);
|
TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(200), TimeValue.timeValueSeconds(300), Property.NodeScope);
|
||||||
|
|
||||||
|
public static final Setting<TimeValue> ELECTION_DURATION_SETTING = Setting.timeSetting(ELECTION_DURATION_SETTING_KEY,
|
||||||
|
TimeValue.timeValueMillis(500), TimeValue.timeValueMillis(1), TimeValue.timeValueSeconds(300), Property.NodeScope);
|
||||||
|
|
||||||
private final TimeValue initialTimeout;
|
private final TimeValue initialTimeout;
|
||||||
private final TimeValue backoffTime;
|
private final TimeValue backoffTime;
|
||||||
private final TimeValue maxTimeout;
|
private final TimeValue maxTimeout;
|
||||||
|
private final TimeValue duration;
|
||||||
private final ThreadPool threadPool;
|
private final ThreadPool threadPool;
|
||||||
private final Random random;
|
private final Random random;
|
||||||
|
|
||||||
@ -85,6 +92,7 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
|||||||
initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings);
|
initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings);
|
||||||
backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings);
|
backoffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings);
|
||||||
maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings);
|
maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings);
|
||||||
|
duration = ELECTION_DURATION_SETTING.get(settings);
|
||||||
|
|
||||||
if (maxTimeout.millis() < initialTimeout.millis()) {
|
if (maxTimeout.millis() < initialTimeout.millis()) {
|
||||||
throw new IllegalArgumentException(new ParameterizedMessage("[{}] is [{}], but must be at least [{}] which is [{}]",
|
throw new IllegalArgumentException(new ParameterizedMessage("[{}] is [{}], but must be at least [{}] which is [{}]",
|
||||||
@ -154,15 +162,11 @@ public class ElectionSchedulerFactory extends AbstractComponent {
|
|||||||
protected void doRun() {
|
protected void doRun() {
|
||||||
if (isClosed.get()) {
|
if (isClosed.get()) {
|
||||||
logger.debug("{} not starting election", this);
|
logger.debug("{} not starting election", this);
|
||||||
return;
|
} else {
|
||||||
|
logger.debug("{} starting election", this);
|
||||||
|
scheduleNextElection(duration, scheduledRunnable);
|
||||||
|
scheduledRunnable.run();
|
||||||
}
|
}
|
||||||
logger.debug("{} starting election", this);
|
|
||||||
scheduledRunnable.run();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void onAfter() {
|
|
||||||
scheduleNextElection(TimeValue.ZERO, scheduledRunnable);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -450,6 +450,7 @@ public final class ClusterSettings extends AbstractScopedSettings {
|
|||||||
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
|
ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING,
|
||||||
ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
|
ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING,
|
||||||
ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
|
ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING,
|
||||||
|
ElectionSchedulerFactory.ELECTION_DURATION_SETTING,
|
||||||
Coordinator.PUBLISH_TIMEOUT_SETTING,
|
Coordinator.PUBLISH_TIMEOUT_SETTING,
|
||||||
JoinHelper.JOIN_TIMEOUT_SETTING
|
JoinHelper.JOIN_TIMEOUT_SETTING
|
||||||
)));
|
)));
|
||||||
|
@ -74,6 +74,7 @@ import static org.elasticsearch.cluster.coordination.Coordinator.Mode.LEADER;
|
|||||||
import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.Coordinator.PUBLISH_TIMEOUT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
|
import static org.elasticsearch.cluster.coordination.CoordinatorTests.Cluster.DEFAULT_DELAY_VARIABILITY;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
||||||
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
|
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_INTERVAL_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
|
import static org.elasticsearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING;
|
||||||
@ -490,6 +491,7 @@ public class CoordinatorTests extends ESTestCase {
|
|||||||
// Then wait for an election to be scheduled; we allow enough time for retries to allow for collisions
|
// Then wait for an election to be scheduled; we allow enough time for retries to allow for collisions
|
||||||
+ defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) * ELECTION_RETRIES
|
+ defaultMillis(ELECTION_INITIAL_TIMEOUT_SETTING) * ELECTION_RETRIES
|
||||||
+ defaultMillis(ELECTION_BACK_OFF_TIME_SETTING) * ELECTION_RETRIES * (ELECTION_RETRIES - 1) / 2
|
+ defaultMillis(ELECTION_BACK_OFF_TIME_SETTING) * ELECTION_RETRIES * (ELECTION_RETRIES - 1) / 2
|
||||||
|
+ defaultMillis(ELECTION_DURATION_SETTING) * ELECTION_RETRIES
|
||||||
// Allow two round-trip for pre-voting and voting
|
// Allow two round-trip for pre-voting and voting
|
||||||
+ 4 * DEFAULT_DELAY_VARIABILITY
|
+ 4 * DEFAULT_DELAY_VARIABILITY
|
||||||
// Then a commit of the new leader's first cluster state
|
// Then a commit of the new leader's first cluster state
|
||||||
|
@ -28,6 +28,7 @@ import org.elasticsearch.test.ESTestCase;
|
|||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_BACK_OFF_TIME_SETTING;
|
||||||
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_DURATION_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_INITIAL_TIMEOUT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.ELECTION_MAX_TIMEOUT_SETTING;
|
||||||
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.toPositiveLongAtMost;
|
import static org.elasticsearch.cluster.coordination.ElectionSchedulerFactory.toPositiveLongAtMost;
|
||||||
@ -48,7 +49,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
|||||||
|
|
||||||
private void assertElectionSchedule(final DeterministicTaskQueue deterministicTaskQueue,
|
private void assertElectionSchedule(final DeterministicTaskQueue deterministicTaskQueue,
|
||||||
final ElectionSchedulerFactory electionSchedulerFactory,
|
final ElectionSchedulerFactory electionSchedulerFactory,
|
||||||
final long initialTimeout, final long backOffTime, final long maxTimeout) {
|
final long initialTimeout, final long backOffTime, final long maxTimeout, final long duration) {
|
||||||
|
|
||||||
final TimeValue initialGracePeriod = randomGracePeriod();
|
final TimeValue initialGracePeriod = randomGracePeriod();
|
||||||
final AtomicBoolean electionStarted = new AtomicBoolean();
|
final AtomicBoolean electionStarted = new AtomicBoolean();
|
||||||
@ -56,7 +57,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
|||||||
try (Releasable ignored = electionSchedulerFactory.startElectionScheduler(initialGracePeriod,
|
try (Releasable ignored = electionSchedulerFactory.startElectionScheduler(initialGracePeriod,
|
||||||
() -> assertTrue(electionStarted.compareAndSet(false, true)))) {
|
() -> assertTrue(electionStarted.compareAndSet(false, true)))) {
|
||||||
|
|
||||||
long lastElectionTime = deterministicTaskQueue.getCurrentTimeMillis();
|
long lastElectionFinishTime = deterministicTaskQueue.getCurrentTimeMillis();
|
||||||
int electionCount = 0;
|
int electionCount = 0;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -70,10 +71,10 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
|||||||
}
|
}
|
||||||
assertTrue(electionStarted.compareAndSet(true, false));
|
assertTrue(electionStarted.compareAndSet(true, false));
|
||||||
|
|
||||||
final long thisElectionTime = deterministicTaskQueue.getCurrentTimeMillis();
|
final long thisElectionStartTime = deterministicTaskQueue.getCurrentTimeMillis();
|
||||||
|
|
||||||
if (electionCount == 1) {
|
if (electionCount == 1) {
|
||||||
final long electionDelay = thisElectionTime - lastElectionTime;
|
final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
|
||||||
|
|
||||||
// Check grace period
|
// Check grace period
|
||||||
assertThat(electionDelay, greaterThanOrEqualTo(initialGracePeriod.millis()));
|
assertThat(electionDelay, greaterThanOrEqualTo(initialGracePeriod.millis()));
|
||||||
@ -84,7 +85,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
final long electionDelay = thisElectionTime - lastElectionTime;
|
final long electionDelay = thisElectionStartTime - lastElectionFinishTime;
|
||||||
|
|
||||||
// Check upper bound
|
// Check upper bound
|
||||||
assertThat(electionDelay, lessThanOrEqualTo(initialTimeout + backOffTime * (electionCount - 1)));
|
assertThat(electionDelay, lessThanOrEqualTo(initialTimeout + backOffTime * (electionCount - 1)));
|
||||||
@ -98,7 +99,7 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
lastElectionTime = thisElectionTime;
|
lastElectionFinishTime = thisElectionStartTime + duration;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
deterministicTaskQueue.runAllTasks();
|
deterministicTaskQueue.runAllTasks();
|
||||||
@ -125,19 +126,28 @@ public class ElectionSchedulerFactoryTests extends ESTestCase {
|
|||||||
randomLongBetween(Math.max(200, initialTimeoutMillis), 180000) + "ms");
|
randomLongBetween(Math.max(200, initialTimeoutMillis), 180000) + "ms");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final long electionDurationMillis;
|
||||||
|
if (randomBoolean()) {
|
||||||
|
electionDurationMillis = randomLongBetween(1, 300000);
|
||||||
|
settingsBuilder.put(ELECTION_DURATION_SETTING.getKey(), electionDurationMillis + "ms");
|
||||||
|
} else {
|
||||||
|
electionDurationMillis = ELECTION_DURATION_SETTING.get(Settings.EMPTY).millis();
|
||||||
|
}
|
||||||
|
|
||||||
final Settings settings = settingsBuilder.put(NODE_NAME_SETTING.getKey(), "node").build();
|
final Settings settings = settingsBuilder.put(NODE_NAME_SETTING.getKey(), "node").build();
|
||||||
final long initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings).millis();
|
final long initialTimeout = ELECTION_INITIAL_TIMEOUT_SETTING.get(settings).millis();
|
||||||
final long backOffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings).millis();
|
final long backOffTime = ELECTION_BACK_OFF_TIME_SETTING.get(settings).millis();
|
||||||
final long maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings).millis();
|
final long maxTimeout = ELECTION_MAX_TIMEOUT_SETTING.get(settings).millis();
|
||||||
|
final long duration = ELECTION_DURATION_SETTING.get(settings).millis();
|
||||||
|
|
||||||
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
|
||||||
final ElectionSchedulerFactory electionSchedulerFactory
|
final ElectionSchedulerFactory electionSchedulerFactory
|
||||||
= new ElectionSchedulerFactory(settings, random(), deterministicTaskQueue.getThreadPool());
|
= new ElectionSchedulerFactory(settings, random(), deterministicTaskQueue.getThreadPool());
|
||||||
|
|
||||||
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout);
|
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
|
||||||
|
|
||||||
// do it again to show that the max is reset when the scheduler is restarted
|
// do it again to show that the max is reset when the scheduler is restarted
|
||||||
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout);
|
assertElectionSchedule(deterministicTaskQueue, electionSchedulerFactory, initialTimeout, backOffTime, maxTimeout, duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSettingsValidation() {
|
public void testSettingsValidation() {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user