Make ensureGreen and ensureYellow wait for cluster size consistency (#21344)
We currently often use ensureGreen or ensureYellow to check whether the cluster is in a good state again after shutting down a node. With the change in #21092, however, it can happen that if the node that is stopped is the master node, another node will become master and publish a cluster state where it is master but where the node that was stopped hasn't yet been removed from the cluster state. Only thereafter will it publish a second state in which the old master is removed. If the ensureGreen/ensureYellow call is timed just right, it will execute before the second cluster state update that removes the old master, so the condition checked by ensureGreen / ensureYellow might no longer hold once that update has been applied.
This commit is contained in:
parent
6b4280c7be
commit
cd34eed03e
|
@ -152,6 +152,7 @@ import java.util.Collections;
|
|||
import java.util.HashSet;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
@ -181,6 +182,7 @@ import static org.hamcrest.Matchers.emptyArray;
|
|||
import static org.hamcrest.Matchers.emptyIterable;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.is;
|
||||
import static org.hamcrest.Matchers.lessThanOrEqualTo;
|
||||
import static org.hamcrest.Matchers.notNullValue;
|
||||
import static org.hamcrest.Matchers.startsWith;
|
||||
|
||||
|
@ -869,15 +871,45 @@ public abstract class ESIntegTestCase extends ESTestCase {
|
|||
* @param timeout time out value to set on {@link org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest}
|
||||
*/
|
||||
public ClusterHealthStatus ensureGreen(TimeValue timeout, String... indices) {
// NOTE(review): this span appears to interleave pre-change and post-change diff lines.
// The health request issued directly below assigns `actionGet`, which is never read
// before the method returns; it looks like the start of the old body - confirm.
|
||||
ClusterHealthResponse actionGet = client().admin().cluster()
|
||||
.health(Requests.clusterHealthRequest(indices).timeout(timeout).waitForGreenStatus().waitForEvents(Priority.LANGUID).waitForNoRelocatingShards(true)).actionGet();
|
||||
// Delegates to the shared helper with GREEN as the requested status and the caller's timeout.
return ensureColor(ClusterHealthStatus.GREEN, timeout, indices);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures the cluster has a yellow state via the cluster health API.
* <p>
* Delegates to {@code ensureColor} with {@link ClusterHealthStatus#YELLOW} and a fixed timeout of 30 seconds.
*
* @param indices the indices passed through to the cluster health request
* @return the cluster health status returned by {@code ensureColor}
|
||||
*/
|
||||
public ClusterHealthStatus ensureYellow(String... indices) {
|
||||
return ensureColor(ClusterHealthStatus.YELLOW, TimeValue.timeValueSeconds(30), indices);
|
||||
}
|
||||
|
||||
/**
 * Shared implementation behind the "ensure&lt;Color&gt;" helpers: issues a cluster health request that waits for
 * the given status, for events of priority {@code LANGUID}, for no relocating shards, and for the number of
 * nodes to equal {@code cluster().size()}.
 * <p>
 * If the request times out, the cluster state and pending cluster tasks are logged and the test is failed.
 * Otherwise the reported status is asserted against the requested one and returned.
 * <p>
 * NOTE(review): this span appears to interleave pre-change and post-change diff lines (a hard-coded
 * "ensureGreen"/"green" log, fail and assertion next to the parameterized versions) - confirm which lines
 * belong to the final method before relying on it.
 *
 * @param clusterHealthStatus the status to wait for
 * @param timeout             the timeout set on the cluster health request
 * @param indices             the indices passed to the cluster health request
 * @return the status reported by the cluster health response
 */
private ClusterHealthStatus ensureColor(ClusterHealthStatus clusterHealthStatus, TimeValue timeout, String... indices) {
|
||||
// Lower-cased status name, used in the failure and debug messages below.
String color = clusterHealthStatus.name().toLowerCase(Locale.ROOT);
|
||||
// Name of the public helper this call corresponds to (e.g. "ensureGreen"), used in the timeout log line.
String method = "ensure" + Strings.capitalize(color);
|
||||
|
||||
ClusterHealthRequest healthRequest = Requests.clusterHealthRequest(indices)
|
||||
.timeout(timeout)
|
||||
.waitForStatus(clusterHealthStatus)
|
||||
.waitForEvents(Priority.LANGUID)
|
||||
.waitForNoRelocatingShards(true)
|
||||
// We currently often use ensureGreen or ensureYellow to check whether the cluster is back in a good state after shutting down
|
||||
// a node. If the node that is stopped is the master node, another node will become master and publish a cluster state where it
|
||||
// is master but where the node that was stopped hasn't been removed yet from the cluster state. It will only subsequently
|
||||
// publish a second state where the old master is removed. If the ensureGreen/ensureYellow is timed just right, it will get to
|
||||
// execute before the second cluster state update removes the old master and the condition ensureGreen / ensureYellow will
|
||||
// trivially hold if it held before the node was shut down. The following "waitForNodes" condition ensures that the node has
|
||||
// been removed by the master so that the health check applies to the set of nodes we expect to be part of the cluster.
|
||||
.waitForNodes(Integer.toString(cluster().size()));
|
||||
|
||||
ClusterHealthResponse actionGet = client().admin().cluster().health(healthRequest).actionGet();
|
||||
if (actionGet.isTimedOut()) {
|
||||
// NOTE(review): the next two statements hard-code "ensureGreen"/"green" and look like the pre-change lines - confirm.
logger.info("ensureGreen timed out, cluster state:\n{}\n{}",
|
||||
client().admin().cluster().prepareState().get().getState(), client().admin().cluster().preparePendingClusterTasks().get());
|
||||
fail("timed out waiting for green state");
|
||||
// Parameterized variant: logs the cluster state and pending tasks under the helper's name, then fails the test.
logger.info("{} timed out, cluster state:\n{}\n{}",
|
||||
method,
|
||||
client().admin().cluster().prepareState().get().getState(),
|
||||
client().admin().cluster().preparePendingClusterTasks().get());
|
||||
fail("timed out waiting for " + color + " state");
|
||||
}
|
||||
// NOTE(review): the GREEN-only assertion and debug line below look like the pre-change lines - confirm.
assertThat(actionGet.getStatus(), equalTo(ClusterHealthStatus.GREEN));
|
||||
logger.debug("indices {} are green", indices.length == 0 ? "[_all]" : indices);
|
||||
// Accepts any status whose value() is <= the requested status's value(); presumably a lower value()
// means a healthier status, as the "at least" wording of the message suggests - confirm.
assertThat("Expected at least " + clusterHealthStatus + " but got " + actionGet.getStatus(),
|
||||
actionGet.getStatus().value(), lessThanOrEqualTo(clusterHealthStatus.value()));
|
||||
logger.debug("indices {} are {}", indices.length == 0 ? "[_all]" : indices, color);
|
||||
return actionGet.getStatus();
|
||||
}
|
||||
|
||||
|
@ -991,21 +1023,6 @@ public abstract class ESIntegTestCase extends ESTestCase {
|
|||
.get().isAcknowledged());
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensures the cluster has a yellow state via the cluster health API.
* <p>
* The health request waits for no relocating shards, for yellow status and for events of priority
* {@code LANGUID}; no explicit timeout is set on the request here. If the response reports a timeout,
* the cluster state and pending cluster tasks are logged and the assertion on {@code isTimedOut()} fails.
* <p>
* NOTE(review): the hunk header above this span shrinks the region from 21 to 6 lines, so this looks like
* the removed, pre-change version of the method - confirm.
*
* @param indices the indices passed to the cluster health request
* @return the status reported by the cluster health response
|
||||
*/
|
||||
public ClusterHealthStatus ensureYellow(String... indices) {
|
||||
ClusterHealthResponse actionGet = client().admin().cluster()
|
||||
.health(Requests.clusterHealthRequest(indices).waitForNoRelocatingShards(true).waitForYellowStatus().waitForEvents(Priority.LANGUID)).actionGet();
|
||||
if (actionGet.isTimedOut()) {
|
||||
logger.info("ensureYellow timed out, cluster state:\n{}\n{}",
|
||||
client().admin().cluster().prepareState().get().getState(), client().admin().cluster().preparePendingClusterTasks().get());
|
||||
// Always fails inside this branch, since isTimedOut() was just observed to be true.
assertThat("timed out waiting for yellow", actionGet.isTimedOut(), equalTo(false));
|
||||
}
|
||||
logger.debug("indices {} are yellow", indices.length == 0 ? "[_all]" : indices);
|
||||
return actionGet.getStatus();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prints the current cluster state as debug logging.
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue