Increase disruption test publish timeout to 5s (#51803)

With the new mechanism for storing cluster state in Lucene, we store
index metadata in multiple data paths too. This causes cluster state
publication to time out too frequently with a 1s timeout, so this commit
increases the publish timeout to 5s. It also increases the follower check
timeout to 5s, since that check sometimes has an fsync in its timeout
path as well, and the leader check timeout to 5s for symmetry.

Closes #51329
This commit is contained in:
Henning Andersen 2020-02-03 13:57:18 +01:00 committed by Henning Andersen
parent 81388051d8
commit 918dfaff1f
3 changed files with 5 additions and 9 deletions

View File

@ -123,12 +123,12 @@ public abstract class AbstractDisruptionTestCase extends ESIntegTestCase {
} }
static final Settings DEFAULT_SETTINGS = Settings.builder() static final Settings DEFAULT_SETTINGS = Settings.builder()
.put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s") // for hitting simulated network failures quickly .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "5s") // for hitting simulated network failures quickly
.put(LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly .put(LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
.put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "1s") // for hitting simulated network failures quickly .put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "5s") // for hitting simulated network failures quickly
.put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly .put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
.put(JoinHelper.JOIN_TIMEOUT_SETTING.getKey(), "10s") // still long to induce failures but to long so test won't time out .put(JoinHelper.JOIN_TIMEOUT_SETTING.getKey(), "10s") // still long to induce failures but to long so test won't time out
.put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s") // <-- for hitting simulated network failures quickly .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "5s") // <-- for hitting simulated network failures quickly
.put(TransportSettings.CONNECT_TIMEOUT.getKey(), "10s") // Network delay disruption waits for the min between this .put(TransportSettings.CONNECT_TIMEOUT.getKey(), "10s") // Network delay disruption waits for the min between this
// value and the time of disruption and does not recover immediately // value and the time of disruption and does not recover immediately
// when disruption is stop. We should make sure we recover faster // when disruption is stop. We should make sure we recover faster

View File

@ -107,9 +107,8 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
@TestIssueLogging(value = "_root:DEBUG,org.elasticsearch.action.bulk:TRACE,org.elasticsearch.action.get:TRACE," + @TestIssueLogging(value = "_root:DEBUG,org.elasticsearch.action.bulk:TRACE,org.elasticsearch.action.get:TRACE," +
"org.elasticsearch.discovery:TRACE,org.elasticsearch.action.support.replication:TRACE," + "org.elasticsearch.discovery:TRACE,org.elasticsearch.action.support.replication:TRACE," +
"org.elasticsearch.cluster.service:TRACE,org.elasticsearch.indices.recovery:TRACE," + "org.elasticsearch.cluster.service:TRACE,org.elasticsearch.indices.recovery:TRACE," +
"org.elasticsearch.indices.cluster:TRACE,org.elasticsearch.index.shard:TRACE," + "org.elasticsearch.indices.cluster:TRACE,org.elasticsearch.index.shard:TRACE",
"org.elasticsearch.gateway.PersistedClusterStateService:TRACE", issueUrl = "https://github.com/elastic/elasticsearch/issues/41068")
issueUrl = "https://github.com/elastic/elasticsearch/issues/41068,https://github.com/elastic/elasticsearch/issues/51329")
public void testAckedIndexing() throws Exception { public void testAckedIndexing() throws Exception {
final int seconds = !(TEST_NIGHTLY && rarely()) ? 1 : 5; final int seconds = !(TEST_NIGHTLY && rarely()) ? 1 : 5;

View File

@ -38,7 +38,6 @@ import org.elasticsearch.discovery.AbstractDisruptionTestCase;
import org.elasticsearch.index.engine.VersionConflictEngineException; import org.elasticsearch.index.engine.VersionConflictEngineException;
import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.disruption.ServiceDisruptionScheme; import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
import org.elasticsearch.test.junit.annotations.TestIssueLogging;
import org.elasticsearch.threadpool.Scheduler; import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.threadpool.ThreadPool;
@ -132,8 +131,6 @@ public class ConcurrentSeqNoVersioningIT extends AbstractDisruptionTestCase {
// multiple threads doing CAS updates. // multiple threads doing CAS updates.
// Wait up to 1 minute (+10s in thread to ensure it does not time out) for threads to complete previous round before initiating next // Wait up to 1 minute (+10s in thread to ensure it does not time out) for threads to complete previous round before initiating next
// round. // round.
@TestIssueLogging(value = "org.elasticsearch.gateway.PersistedClusterStateService:TRACE",
issueUrl = "https://github.com/elastic/elasticsearch/issues/51329")
public void testSeqNoCASLinearizability() { public void testSeqNoCASLinearizability() {
final int disruptTimeSeconds = scaledRandomIntBetween(1, 8); final int disruptTimeSeconds = scaledRandomIntBetween(1, 8);