From 06b8a1ada7844162e3fe114eafb16052415897e8 Mon Sep 17 00:00:00 2001
From: Boaz Leskes <b.leskes@gmail.com>
Date: Wed, 1 Feb 2017 10:22:02 +0100
Subject: [PATCH] fix testCannotAllocateStaleReplicaExplanation node management

The test tried to create a situation where a stale replica is the only shard copy available. It did so by stopping the node with the replica, indexing some more data, stopping the primary node, and then starting a new node. This is flawed because the newly started node may reuse the data path of the primary node, in which case things go back to green. Instead, we should make sure that the replica is on the data path that will be selected when the new node is started (i.e., the path with the smaller ordinal).
---
 .../ClusterAllocationExplainIT.java           | 32 +++++++++++++------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/core/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java b/core/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java
index b6f225c8c71..87b7066c38f 100644
--- a/core/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java
+++ b/core/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java
@@ -52,6 +52,7 @@ import java.util.Map;
 import java.util.Set;
 
 import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.isOneOf;
@@ -1012,28 +1013,39 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
     @AwaitsFix(bugUrl = "")
     public void testCannotAllocateStaleReplicaExplanation() throws Exception {
         logger.info("--> starting 3 nodes");
-        internalCluster().startNodes(3);
-        String masterNode = internalCluster().getMasterName();
+        final String masterNode = internalCluster().startNode();
+        // start the replica node first, so its path will be used first when we start a node after
+        // stopping both the replica and primary nodes later in the test.
+        final String replicaNode = internalCluster().startNode();
+        final String primaryNode = internalCluster().startNode();
 
         logger.info("--> creating an index with 1 primary and 1 replica");
         createIndexAndIndexData(1, 1,
-            Settings.builder().put("index.routing.allocation.exclude._name", masterNode).build(),
-            ActiveShardCount.ALL);
+            Settings.builder()
+                .put("index.routing.allocation.include._name", primaryNode)
+                .put("index.routing.allocation.exclude._name", masterNode)
+                .build(),
+            ActiveShardCount.ONE);
+
+        client().admin().indices().prepareUpdateSettings("idx").setSettings(
+            Settings.builder().put("index.routing.allocation.include._name", (String) null)).get();
+        ensureGreen();
+
+        assertThat(replicaNode().getName(), equalTo(replicaNode));
+        assertThat(primaryNodeName(), equalTo(primaryNode));
 
         logger.info("--> stop node with the replica shard");
-        String stoppedNode = replicaNode().getName();
-        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(stoppedNode));
+        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
 
         logger.info("--> index more data, now the replica is stale");
         indexData();
 
         logger.info("--> stop the node with the primary");
-        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(primaryNodeName()));
+        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(primaryNode));
 
         logger.info("--> restart the node with the stale replica");
-        client().admin().indices().prepareUpdateSettings("idx").setSettings(
-            Settings.builder().put("index.routing.allocation.include._name", (String) null)).get();
-        String restartedNode = internalCluster().startNode(Settings.builder().put("node.name", stoppedNode).build());
+        String restartedNode = internalCluster().startDataOnlyNode();
+        ensureClusterSizeConsistency(); // wait for the master to finish processing join.
 
         // wait until the system has fetched shard data and we know there is no valid shard copy
         assertBusy(() -> {