diff --git a/server/src/test/java/org/elasticsearch/discovery/MasterDisruptionIT.java b/server/src/test/java/org/elasticsearch/discovery/MasterDisruptionIT.java index 718904eecb5..fc9450e9826 100644 --- a/server/src/test/java/org/elasticsearch/discovery/MasterDisruptionIT.java +++ b/server/src/test/java/org/elasticsearch/discovery/MasterDisruptionIT.java @@ -30,7 +30,6 @@ import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; -import org.elasticsearch.common.Strings; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; @@ -71,76 +70,6 @@ import static org.hamcrest.Matchers.nullValue; @ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, transportClientRatio = 0) public class MasterDisruptionIT extends AbstractDisruptionTestCase { - /** - * Test that no split brain occurs under partial network partition. See https://github.com/elastic/elasticsearch/issues/2488 - */ - public void testFailWithMinimumMasterNodesConfigured() throws Exception { - List nodes = startCluster(3); - - // Figure out what is the elected master node - final String masterNode = internalCluster().getMasterName(); - logger.info("---> legit elected master node={}", masterNode); - - // Pick a node that isn't the elected master. - Set nonMasters = new HashSet<>(nodes); - nonMasters.remove(masterNode); - final String unluckyNode = randomFrom(nonMasters.toArray(Strings.EMPTY_ARRAY)); - - - // Simulate a network issue between the unlucky node and elected master node in both directions. 
- - NetworkDisruption networkDisconnect = new NetworkDisruption( - new NetworkDisruption.TwoPartitions(masterNode, unluckyNode), - new NetworkDisruption.NetworkDisconnect()); - setDisruptionScheme(networkDisconnect); - networkDisconnect.startDisrupting(); - - // Wait until elected master has removed that the unlucky node... - ensureStableCluster(2, masterNode); - - // The unlucky node must report *no* master node, since it can't connect to master and in fact it should - // continuously ping until network failures have been resolved. However - // It may a take a bit before the node detects it has been cut off from the elected master - assertNoMaster(unluckyNode); - - networkDisconnect.stopDisrupting(); - - // Wait until the master node sees all 3 nodes again. - ensureStableCluster(3); - - // The elected master shouldn't have changed, since the unlucky node never could have elected himself as - // master since m_m_n of 2 could never be satisfied. - assertMaster(masterNode, nodes); - } - - /** - * Verify that nodes fault detection works after master (re) election - */ - public void testNodesFDAfterMasterReelection() throws Exception { - startCluster(4); - - logger.info("--> stopping current master"); - internalCluster().stopCurrentMasterNode(); - - ensureStableCluster(3); - - String master = internalCluster().getMasterName(); - String nonMaster = null; - for (String node : internalCluster().getNodeNames()) { - if (!node.equals(master)) { - nonMaster = node; - } - } - - logger.info("--> isolating [{}]", nonMaster); - NetworkDisruption.TwoPartitions partitions = isolateNode(nonMaster); - NetworkDisruption networkDisruption = addRandomDisruptionType(partitions); - networkDisruption.startDisrupting(); - - logger.info("--> waiting for master to remove it"); - ensureStableCluster(2, master); - } - /** * Tests that emulates a frozen elected master node that unfreezes and pushes his cluster state to other nodes * that already are following another elected master node. 
These nodes should reject this cluster state and prevent diff --git a/server/src/test/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java b/server/src/test/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java new file mode 100644 index 00000000000..b5177b1ce3e --- /dev/null +++ b/server/src/test/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java @@ -0,0 +1,170 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+package org.elasticsearch.discovery;
+
+import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
+import org.elasticsearch.cluster.coordination.FollowersChecker;
+import org.elasticsearch.cluster.coordination.LeaderChecker;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.disruption.NetworkDisruption;
+import org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect;
+import org.elasticsearch.test.disruption.NetworkDisruption.NetworkUnresponsive;
+import org.elasticsearch.test.junit.annotations.TestLogging;
+import org.elasticsearch.test.transport.MockTransportService.TestPlugin;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static java.util.Collections.singleton;
+import static org.hamcrest.Matchers.equalTo;
+
+/**
+ * Tests relating to the loss of the master, but which work with the default fault detection settings which are rather lenient and will
+ * not detect a master failure too quickly.
+ */
+@TestLogging("_root:DEBUG,org.elasticsearch.cluster.service:TRACE")
+@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
+public class StableMasterDisruptionIT extends ESIntegTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return Collections.singletonList(TestPlugin.class);
+    }
+
+    /**
+     * Test that no split brain occurs under partial network partition. See https://github.com/elastic/elasticsearch/issues/2488
+     */
+    public void testFailWithMinimumMasterNodesConfigured() throws Exception {
+        List<String> nodes = internalCluster().startNodes(3);
+        ensureStableCluster(3);
+
+        // Figure out which node is the elected master
+        final String masterNode = internalCluster().getMasterName();
+        logger.info("---> legit elected master node={}", masterNode);
+
+        // Pick a node that isn't the elected master.
+        Set<String> nonMasters = new HashSet<>(nodes);
+        nonMasters.remove(masterNode);
+        final String unluckyNode = randomFrom(nonMasters.toArray(Strings.EMPTY_ARRAY));
+
+        // Simulate a network issue between the unlucky node and the elected master node in both directions.
+
+        NetworkDisruption networkDisconnect = new NetworkDisruption(
+            new NetworkDisruption.TwoPartitions(masterNode, unluckyNode),
+            new NetworkDisruption.NetworkDisconnect());
+        setDisruptionScheme(networkDisconnect);
+        networkDisconnect.startDisrupting();
+
+        // Wait until the elected master has removed the unlucky node...
+        ensureStableCluster(2, masterNode);
+
+        // The unlucky node must report *no* master node, since it can't connect to the master and in fact it should
+        // continuously ping until network failures have been resolved. However,
+        // it may take a bit before the node detects it has been cut off from the elected master, so poll with assertBusy
+        assertBusy(() -> assertNull(client(unluckyNode).admin().cluster().state(
+            new ClusterStateRequest().local(true)).get().getState().nodes().getMasterNode()));
+
+        networkDisconnect.stopDisrupting();
+
+        // Wait until the master node sees all 3 nodes again.
+        ensureStableCluster(3);
+
+        // The elected master shouldn't have changed, since the unlucky node never could have elected itself as master
+        assertThat(internalCluster().getMasterName(), equalTo(masterNode));
+    }
+
+    /**
+     * Verify that, after a master re-election, the new master removes a node isolated by a {@link NetworkUnresponsive} disruption
+     */
+    public void testFollowerCheckerDetectsUnresponsiveNodeAfterMasterReelection() throws Exception {
+        internalCluster().startNodes(4,
+            Settings.builder()
+                .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+                .put(LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), "10")
+                .put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+                .put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1).build());
+        ensureStableCluster(4);
+
+        logger.info("--> stopping current master");
+        internalCluster().stopCurrentMasterNode();
+
+        ensureStableCluster(3);
+
+        final String master = internalCluster().getMasterName();
+        final List<String> nonMasters = Arrays.stream(internalCluster().getNodeNames()).filter(n -> master.equals(n) == false)
+            .collect(Collectors.toList());
+        final String isolatedNode = randomFrom(nonMasters);
+        final String otherNode = nonMasters.get(nonMasters.get(0).equals(isolatedNode) ? 1 : 0);
+
+        logger.info("--> isolating [{}]", isolatedNode);
+
+        final NetworkDisruption networkDisruption = new NetworkDisruption(new NetworkDisruption.TwoPartitions(
+            singleton(isolatedNode), Sets.newHashSet(master, otherNode)), new NetworkUnresponsive());
+        setDisruptionScheme(networkDisruption);
+        networkDisruption.startDisrupting();
+
+        logger.info("--> waiting for master to remove it");
+        ensureStableCluster(2, master);
+
+        networkDisruption.stopDisrupting();
+        ensureStableCluster(3);
+    }
+
+    /**
+     * Verify that, after a master re-election, the new master removes a node isolated by a {@link NetworkDisconnect} disruption
+     */
+    public void testFollowerCheckerDetectsDisconnectedNodeAfterMasterReelection() throws Exception {
+        internalCluster().startNodes(4);
+        ensureStableCluster(4);
+
+        logger.info("--> stopping current master");
+        internalCluster().stopCurrentMasterNode();
+
+        ensureStableCluster(3);
+
+        final String master = internalCluster().getMasterName();
+        final List<String> nonMasters = Arrays.stream(internalCluster().getNodeNames()).filter(n -> master.equals(n) == false)
+            .collect(Collectors.toList());
+        final String isolatedNode = randomFrom(nonMasters);
+        final String otherNode = nonMasters.get(nonMasters.get(0).equals(isolatedNode) ? 1 : 0);
+
+        logger.info("--> isolating [{}]", isolatedNode);
+
+        final NetworkDisruption networkDisruption = new NetworkDisruption(new NetworkDisruption.TwoPartitions(
+            singleton(isolatedNode), Stream.of(master, otherNode).collect(Collectors.toSet())), new NetworkDisconnect());
+        setDisruptionScheme(networkDisruption);
+        networkDisruption.startDisrupting();
+
+        logger.info("--> waiting for master to remove it");
+        ensureStableCluster(2, master);
+
+        networkDisruption.stopDisrupting();
+        ensureStableCluster(3);
+    }
+}