testCanNotPublishWithoutMinMastNodes could time out if the disruption hit just before the cluster was fully formed

The test uses a NetworkDelay disruption that drops requests and slows down connecting. In addition, it disables node fault detection to make sure nodes are not removed before we check our publishing. Sadly, that can lead to huge slowdowns if the disruption hits while a node is still pinging (and trying to connect, which is slowed down). Instead, we can start the disruption on the cluster state thread, making sure the result of fault detection won't be processed before we publish.
This commit is contained in:
Boaz Leskes 2016-09-15 00:22:07 +02:00
parent 8469c98e34
commit a5f03b4bc5
1 changed files with 2 additions and 3 deletions

View File

@ -31,7 +31,6 @@ import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoverySettings;
import org.elasticsearch.discovery.zen.ZenDiscovery;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.fd.FaultDetection;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
@ -364,7 +363,6 @@ public class MinimumMasterNodesIT extends ESIntegTestCase {
public void testCanNotPublishWithoutMinMastNodes() throws Exception {
Settings settings = Settings.builder()
.put("discovery.type", "zen")
.put(FaultDetection.PING_TIMEOUT_SETTING.getKey(), "1h") // disable it
.put(ZenDiscovery.PING_TIMEOUT_SETTING.getKey(), "200ms")
.put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2)
.put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "100ms") // speed things up
@ -379,7 +377,6 @@ public class MinimumMasterNodesIT extends ESIntegTestCase {
new TwoPartitions(Collections.singleton(master), otherNodes),
new NetworkDelay(TimeValue.timeValueMinutes(1)));
internalCluster().setDisruptionScheme(partition);
partition.startDisrupting();
final CountDownLatch latch = new CountDownLatch(1);
final AtomicReference<Exception> failure = new AtomicReference<>();
@ -393,6 +390,8 @@ public class MinimumMasterNodesIT extends ESIntegTestCase {
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
logger.debug("--> starting the disruption, preventing cluster state publishing");
partition.startDisrupting();
MetaData.Builder metaData = MetaData.builder(currentState.metaData()).persistentSettings(
Settings.builder().put(currentState.metaData().persistentSettings()).put("_SHOULD_NOT_BE_THERE_", true).build()
);