Strengthen validateClusterFormed check (#49248)

Strengthens the validateClusterFormed check that is used by the test infrastructure to make
sure that nodes are properly connected and know about each other. Is used in situations where
the cluster is scaled up and down, and where there previously was a network disruption that has
been healed.

Closes #49243
This commit is contained in:
Yannick Welsch 2019-11-21 17:37:41 +01:00
parent d9835f7fb4
commit 420825c3b5
2 changed files with 25 additions and 26 deletions

View File

@ -161,9 +161,9 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.junit.Assert.assertEquals;
import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.nullValue; import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat; import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
@ -1239,29 +1239,35 @@ public final class InternalTestCluster extends TestCluster {
/** ensure a cluster is formed with all published nodes. */ /** ensure a cluster is formed with all published nodes. */
public synchronized void validateClusterFormed() { public synchronized void validateClusterFormed() {
String name = randomFrom(random, getNodeNames()); final Set<DiscoveryNode> expectedNodes = new HashSet<>();
validateClusterFormed(name);
}
/** ensure a cluster is formed with all published nodes, but do so by using the client of the specified node */
private synchronized void validateClusterFormed(String viaNode) {
Set<DiscoveryNode> expectedNodes = new HashSet<>();
for (NodeAndClient nodeAndClient : nodes.values()) { for (NodeAndClient nodeAndClient : nodes.values()) {
expectedNodes.add(getInstanceFromNode(ClusterService.class, nodeAndClient.node()).localNode()); expectedNodes.add(getInstanceFromNode(ClusterService.class, nodeAndClient.node()).localNode());
} }
logger.trace("validating cluster formed via [{}], expecting {}", viaNode, expectedNodes); logger.trace("validating cluster formed, expecting {}", expectedNodes);
final Client client = client(viaNode);
try { try {
assertBusy(() -> { assertBusy(() -> {
DiscoveryNodes discoveryNodes = client.admin().cluster().prepareState().get().getState().nodes(); final List<ClusterState> states = nodes.values().stream()
assertEquals(expectedNodes.size(), discoveryNodes.getSize()); .map(node -> getInstanceFromNode(ClusterService.class, node.node()))
.map(ClusterService::state)
.collect(Collectors.toList());
final String debugString = ", expected nodes: " + expectedNodes + " and actual cluster states " + states;
// all nodes have a master
assertTrue("Missing master" + debugString, states.stream().allMatch(cs -> cs.nodes().getMasterNodeId() != null));
// all nodes have the same master (in same term)
assertEquals("Not all masters in same term" + debugString, 1,
states.stream().mapToLong(ClusterState::term).distinct().count());
// all nodes know about all other nodes
states.forEach(cs -> {
DiscoveryNodes discoveryNodes = cs.nodes();
assertEquals("Node size mismatch" + debugString, expectedNodes.size(), discoveryNodes.getSize());
for (DiscoveryNode expectedNode : expectedNodes) { for (DiscoveryNode expectedNode : expectedNodes) {
assertTrue("Expected node to exist: " + expectedNode, discoveryNodes.nodeExists(expectedNode)); assertTrue("Expected node to exist: " + expectedNode + debugString, discoveryNodes.nodeExists(expectedNode));
} }
});
}, 30, TimeUnit.SECONDS); }, 30, TimeUnit.SECONDS);
} catch (AssertionError ae) { } catch (AssertionError ae) {
throw new IllegalStateException("cluster failed to form with expected nodes " + expectedNodes + " and actual nodes " + throw new IllegalStateException("cluster failed to form", ae);
client.admin().cluster().prepareState().get().getState().nodes());
} catch (Exception e) { } catch (Exception e) {
throw new IllegalStateException(e); throw new IllegalStateException(e);
} }
@ -1821,8 +1827,7 @@ public final class InternalTestCluster extends TestCluster {
if (callback.validateClusterForming() || excludedNodeIds.isEmpty() == false) { if (callback.validateClusterForming() || excludedNodeIds.isEmpty() == false) {
// we have to validate cluster size to ensure that the restarted node has rejoined the cluster if it was master-eligible; // we have to validate cluster size to ensure that the restarted node has rejoined the cluster if it was master-eligible;
// we have to do this via the node that was just restarted as it may be that the master didn't yet process the fact that it left validateClusterFormed();
validateClusterFormed(nodeAndClient.name);
} }
if (excludedNodeIds.isEmpty() == false) { if (excludedNodeIds.isEmpty() == false) {

View File

@ -20,7 +20,6 @@
package org.elasticsearch.test.disruption; package org.elasticsearch.test.disruption;
import com.carrotsearch.randomizedtesting.generators.RandomPicks; import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterState;
@ -41,8 +40,6 @@ import java.util.Set;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
import java.util.function.BiConsumer; import java.util.function.BiConsumer;
import static org.junit.Assert.assertFalse;
/** /**
* Network disruptions are modeled using two components: * Network disruptions are modeled using two components:
* 1) the {@link DisruptedLinks} represents the links in the network that are to be disrupted * 1) the {@link DisruptedLinks} represents the links in the network that are to be disrupted
@ -119,10 +116,7 @@ public class NetworkDisruption implements ServiceDisruptionScheme {
} }
protected void ensureNodeCount(InternalTestCluster cluster) { protected void ensureNodeCount(InternalTestCluster cluster) {
assertFalse("cluster failed to form after disruption was healed", cluster.client().admin().cluster().prepareHealth() cluster.validateClusterFormed();
.setWaitForNodes(String.valueOf(cluster.size()))
.setWaitForNoRelocatingShards(true)
.get().isTimedOut());
} }
@Override @Override