Add a BridgePartition to be used by testAckedIndexing (#19172)
We have long worked to capture different partitioning scenarios in our testing infra. This PR adds a new variant, inspired by the Jepsen blogs, which was forgotten so far - namely a partition where one node can still see and be seen by all other nodes. It also updates the resiliency page to better reflect all the work that was done in this area.
This commit is contained in:
parent
5a7dfe78c2
commit
09ca6d6ed2
|
@ -244,7 +244,7 @@ public class TransportMasterNodeActionTests extends ESTestCase {
|
|||
Request request = new Request();
|
||||
PlainActionFuture<Response> listener = new PlainActionFuture<>();
|
||||
|
||||
setState(clusterService, ClusterStateCreationUtils.state(localNode, randomFrom(null, localNode, remoteNode), allNodes));
|
||||
setState(clusterService, ClusterStateCreationUtils.state(localNode, randomFrom(localNode, remoteNode, null), allNodes));
|
||||
|
||||
new Action(Settings.EMPTY, "testAction", transportService, clusterService, threadPool) {
|
||||
@Override
|
||||
|
|
|
@ -51,9 +51,9 @@ public class BooleansTests extends ESTestCase {
|
|||
assertThat(Booleans.parseBoolean(null, false), is(false));
|
||||
assertThat(Booleans.parseBoolean(null, true), is(true));
|
||||
|
||||
assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes", "1"), randomFrom(null, Boolean.TRUE, Boolean.FALSE)), is(true));
|
||||
assertThat(Booleans.parseBoolean(randomFrom("false", "off", "no", "0"), randomFrom(null, Boolean.TRUE, Boolean.FALSE)), is(false));
|
||||
assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes").toUpperCase(Locale.ROOT),randomFrom(null, Boolean.TRUE, Boolean.FALSE)), is(true));
|
||||
assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes", "1"), randomFrom(Boolean.TRUE, Boolean.FALSE, null)), is(true));
|
||||
assertThat(Booleans.parseBoolean(randomFrom("false", "off", "no", "0"), randomFrom(Boolean.TRUE, Boolean.FALSE, null)), is(false));
|
||||
assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes").toUpperCase(Locale.ROOT),randomFrom(Boolean.TRUE, Boolean.FALSE, null)), is(true));
|
||||
assertThat(Booleans.parseBoolean(null, Boolean.FALSE), is(false));
|
||||
assertThat(Booleans.parseBoolean(null, Boolean.TRUE), is(true));
|
||||
assertThat(Booleans.parseBoolean(null, null), nullValue());
|
||||
|
@ -70,7 +70,7 @@ public class BooleansTests extends ESTestCase {
|
|||
assertThat(Booleans.parseBooleanExact(randomFrom("true", "on", "yes", "1")), is(true));
|
||||
assertThat(Booleans.parseBooleanExact(randomFrom("false", "off", "no", "0")), is(false));
|
||||
try {
|
||||
Booleans.parseBooleanExact(randomFrom(null, "fred", "foo", "barney"));
|
||||
Booleans.parseBooleanExact(randomFrom("fred", "foo", "barney", null));
|
||||
fail("Expected exception while parsing invalid boolean value ");
|
||||
} catch (Exception ex) {
|
||||
assertTrue(ex instanceof IllegalArgumentException);
|
||||
|
|
|
@ -62,6 +62,7 @@ import org.elasticsearch.test.ESIntegTestCase.Scope;
|
|||
import org.elasticsearch.test.InternalTestCluster;
|
||||
import org.elasticsearch.test.discovery.ClusterDiscoveryConfiguration;
|
||||
import org.elasticsearch.test.disruption.BlockClusterStateProcessing;
|
||||
import org.elasticsearch.test.disruption.BridgePartition;
|
||||
import org.elasticsearch.test.disruption.IntermittentLongGCDisruption;
|
||||
import org.elasticsearch.test.disruption.LongGCDisruption;
|
||||
import org.elasticsearch.test.disruption.NetworkDelaysPartition;
|
||||
|
@ -447,8 +448,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
|
|||
final int seconds = !(TEST_NIGHTLY && rarely()) ? 1 : 5;
|
||||
final String timeout = seconds + "s";
|
||||
|
||||
// TODO: add node count randomization
|
||||
final List<String> nodes = startCluster(3);
|
||||
final List<String> nodes = startCluster(rarely() ? 5 : 3);
|
||||
|
||||
assertAcked(prepareCreate("test")
|
||||
.setSettings(Settings.builder()
|
||||
|
@ -540,7 +540,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
|
|||
logger.info("stopping disruption");
|
||||
disruptionScheme.stopDisrupting();
|
||||
for (String node : internalCluster().getNodeNames()) {
|
||||
ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() +
|
||||
ensureStableCluster(nodes.size(), TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() +
|
||||
DISRUPTION_HEALING_OVERHEAD.millis()), true, node);
|
||||
}
|
||||
ensureGreen("test");
|
||||
|
@ -548,7 +548,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
|
|||
logger.info("validating successful docs");
|
||||
for (String node : nodes) {
|
||||
try {
|
||||
logger.debug("validating through node [{}]", node);
|
||||
logger.debug("validating through node [{}] ([{}] acked docs)", node, ackedDocs.size());
|
||||
for (String id : ackedDocs.keySet()) {
|
||||
assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found",
|
||||
client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists());
|
||||
|
@ -1192,7 +1192,8 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
|
|||
new NetworkUnresponsivePartition(random()),
|
||||
new NetworkDelaysPartition(random()),
|
||||
new NetworkDisconnectPartition(random()),
|
||||
new SlowClusterStateProcessing(random())
|
||||
new SlowClusterStateProcessing(random()),
|
||||
new BridgePartition(random(), randomBoolean())
|
||||
);
|
||||
Collections.shuffle(list, random());
|
||||
setDisruptionScheme(list.get(0));
|
||||
|
|
|
@ -55,6 +55,14 @@ If you encounter an issue, https://github.com/elastic/elasticsearch/issues[pleas
|
|||
|
||||
We are committed to tracking down and fixing all the issues that are posted.
|
||||
|
||||
[float]
|
||||
==== Jepsen Tests
|
||||
|
||||
The Jepsen platform is specifically designed to test distributed systems. It is not a single test and is regularly adapted
|
||||
to create new scenarios. We have ported all published scenarios to our testing infrastructure. Of course,
|
||||
as the system evolves, new scenarios can come up that are not yet covered. We are committed to investigating all new scenarios and will
|
||||
report issues that we find on this page and in our GitHub repository.
|
||||
|
||||
[float]
|
||||
=== Better request retry mechanism when nodes are disconnected (STATUS: ONGOING)
|
||||
|
||||
|
@ -102,17 +110,31 @@ Indices stats and indices segments requests reach out to all nodes that have sha
|
|||
while the stats request arrives will make that part of the request fail and are just ignored in the overall stats result. {GIT}13719[#13719]
|
||||
|
||||
[float]
|
||||
=== Jepsen Test Failures (STATUS: ONGOING)
|
||||
=== Documentation of guarantees and handling of failures (STATUS: ONGOING)
|
||||
|
||||
We have increased our test coverage to include scenarios tested by Jepsen. We make heavy use of randomization to expand on the scenarios that can be tested and to introduce new error conditions. You can follow the work on the master branch of the https://github.com/elastic/elasticsearch/blob/master/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java[`DiscoveryWithServiceDisruptionsIT` class], where we will add more tests as time progresses.
|
||||
This status page is a start, but we can do a better job of explicitly documenting the processes at work in Elasticsearch and what happens
|
||||
in the case of each type of failure. The plan is to have a test case that validates each behavior under simulated conditions. Every test
|
||||
will document the expected results, the associated test code, and an explicit PASS or FAIL status for each simulated case.
|
||||
|
||||
[float]
|
||||
=== Document guarantees and handling of failure (STATUS: ONGOING)
|
||||
=== Run Jepsen (STATUS: ONGOING)
|
||||
|
||||
We have ported all of the known scenarios in the Jepsen blogs to our testing infrastructure. The new tests are run continuously in our
|
||||
testing farm and are passing. We are also working on running Jepsen independently to verify that no failures are found.
|
||||
|
||||
This status page is a start, but we can do a better job of explicitly documenting the processes at work in Elasticsearch, and what happens in the case of each type of failure. The plan is to have a test case that validates each behavior under simulated conditions. Every test will document the expected results, the associated test code and an explicit PASS or FAIL status for each simulated case.
|
||||
|
||||
== Unreleased
|
||||
|
||||
[float]
|
||||
=== Port Jepsen tests to our testing framework (STATUS: UNRELEASED, v5.0.0)
|
||||
|
||||
We have increased our test coverage to include scenarios tested by Jepsen, as described in the Elasticsearch-related blogs. We make heavy
|
||||
use of randomization to expand on the scenarios that can be tested and to introduce new error conditions.
|
||||
You can follow the work on the master branch of the
|
||||
https://github.com/elastic/elasticsearch/blob/master/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java[`DiscoveryWithServiceDisruptionsIT` class],
|
||||
where the `testAckedIndexing` test was specifically added to cover known Jepsen-related scenarios.
|
||||
|
||||
|
||||
[float]
|
||||
=== Loss of documents during network partition (STATUS: UNRELEASED, v5.0.0)
|
||||
|
||||
|
|
|
@ -22,7 +22,6 @@ package org.elasticsearch.index.reindex;
|
|||
import org.elasticsearch.action.index.IndexRequest;
|
||||
import org.elasticsearch.action.search.SearchRequest;
|
||||
import org.elasticsearch.common.lucene.uid.Versions;
|
||||
import org.elasticsearch.script.ExecutableScript;
|
||||
import org.elasticsearch.script.ScriptService;
|
||||
|
||||
import java.util.Map;
|
||||
|
@ -106,7 +105,7 @@ public class ReindexScriptTests extends AbstractAsyncBulkIndexByScrollActionScri
|
|||
}
|
||||
|
||||
public void testSetTimestamp() throws Exception {
|
||||
String timestamp = randomFrom(null, "now", "1234");
|
||||
String timestamp = randomFrom("now", "1234", null);
|
||||
IndexRequest index = applyScript((Map<String, Object> ctx) -> ctx.put("_timestamp", timestamp));
|
||||
assertEquals(timestamp, index.timestamp());
|
||||
}
|
||||
|
|
|
@ -92,6 +92,7 @@ import java.util.Collections;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
|
@ -344,9 +345,15 @@ public abstract class ESTestCase extends LuceneTestCase {
|
|||
|
||||
/** Pick a random object from the given array. The array must not be empty. */
|
||||
public static <T> T randomFrom(T... array) {
|
||||
return RandomPicks.randomFrom(random(), array);
|
||||
return randomFrom(random(), array);
|
||||
}
|
||||
|
||||
/**
 * Pick a random object from the given array, drawing from the supplied {@link Random}
 * instead of the test's default random source. The array must not be empty.
 *
 * @param random the source of randomness handed to {@code RandomPicks.randomFrom}
 * @param array  the candidates to choose from
 * @return the element chosen by {@code RandomPicks.randomFrom}
 */
|
||||
public static <T> T randomFrom(Random random, T... array) {
|
||||
return RandomPicks.randomFrom(random, array);
|
||||
}
|
||||
|
||||
|
||||
/** Pick a random object from the given list. */
|
||||
public static <T> T randomFrom(List<T> list) {
|
||||
return RandomPicks.randomFrom(random(), list);
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.test.disruption;
|
||||
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.test.InternalTestCluster;
|
||||
import org.elasticsearch.test.transport.MockTransportService;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
import static org.elasticsearch.test.ESTestCase.randomFrom;
|
||||
|
||||
/**
|
||||
* A partition that breaks the cluster into two groups of nodes. The two groups are fully isolated
|
||||
* with the exception of a single node that can see and be seen by all nodes in both groups.
|
||||
*/
|
||||
public class BridgePartition extends NetworkPartition {
|
||||
|
||||
String bridgeNode;
|
||||
final boolean unresponsive;
|
||||
|
||||
public BridgePartition(Random random, boolean unresponsive) {
|
||||
super(random);
|
||||
this.unresponsive = unresponsive;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void applyToCluster(InternalTestCluster cluster) {
|
||||
bridgeNode = randomFrom(random, cluster.getNodeNames());
|
||||
this.cluster = cluster;
|
||||
for (String node: cluster.getNodeNames()) {
|
||||
if (node.equals(bridgeNode) == false) {
|
||||
super.applyToNode(node, cluster);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimeValue expectedTimeToHeal() {
|
||||
return TimeValue.timeValueSeconds(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
void applyDisruption(MockTransportService transportService1, MockTransportService transportService2) {
|
||||
if (unresponsive) {
|
||||
transportService1.addUnresponsiveRule(transportService2);
|
||||
transportService2.addUnresponsiveRule(transportService1);
|
||||
} else {
|
||||
transportService1.addFailToSendNoConnectRule(transportService2);
|
||||
transportService2.addFailToSendNoConnectRule(transportService1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getPartitionDescription() {
|
||||
return "bridge (super connected node: [" + bridgeNode + "], unresponsive [" + unresponsive + "])";
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue