Fix testCannotJoinIfMasterLostDataFolder

Relates to #41047
This commit is contained in:
Yannick Welsch 2019-05-22 14:36:17 +02:00
parent a79cd77e5c
commit 250973af1d
1 changed file with 18 additions and 5 deletions

View File

@ -31,6 +31,7 @@ import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.action.shard.ShardStateAction;
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
import org.elasticsearch.cluster.coordination.LagDetector;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.Murmur3HashFunction;
import org.elasticsearch.cluster.routing.ShardRouting;
@ -388,7 +389,6 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
}
}
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/41047")
public void testCannotJoinIfMasterLostDataFolder() throws Exception {
String masterNode = internalCluster().startMasterOnlyNode();
String dataNode = internalCluster().startDataOnlyNode();
@ -401,7 +401,18 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
@Override
public Settings onNodeStopped(String nodeName) {
return Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName).build();
return Settings.builder()
.put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName)
/*
* the data node might join while the master is still not fully established as master just yet and bypass the join
* validation that is done before adding the node to the cluster. Only the join validation when handling the publish
* request takes place, but at this point the cluster state has been successfully committed, and will subsequently be
* exposed to the applier. The health check below therefore sees the cluster state with the 2 nodes and thinks all is
* good, even though the data node never accepted this state. What's worse is that it takes 90 seconds for the data
* node to be kicked out of the cluster (lag detection). We speed this up here.
*/
.put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s")
.build();
}
@Override
@ -410,9 +421,11 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
}
});
assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
.isTimedOut());
assertBusy(() -> {
assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
.isTimedOut());
}, 30, TimeUnit.SECONDS);
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode)); // otherwise we will fail during clean-up
}