diff --git a/core/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java b/core/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java index 32fe6b1e408..e15c869c6ff 100644 --- a/core/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java @@ -244,7 +244,7 @@ public class TransportMasterNodeActionTests extends ESTestCase { Request request = new Request(); PlainActionFuture listener = new PlainActionFuture<>(); - setState(clusterService, ClusterStateCreationUtils.state(localNode, randomFrom(null, localNode, remoteNode), allNodes)); + setState(clusterService, ClusterStateCreationUtils.state(localNode, randomFrom(localNode, remoteNode, null), allNodes)); new Action(Settings.EMPTY, "testAction", transportService, clusterService, threadPool) { @Override diff --git a/core/src/test/java/org/elasticsearch/common/BooleansTests.java b/core/src/test/java/org/elasticsearch/common/BooleansTests.java index 6e5446cebf9..176c4c75dc7 100644 --- a/core/src/test/java/org/elasticsearch/common/BooleansTests.java +++ b/core/src/test/java/org/elasticsearch/common/BooleansTests.java @@ -51,9 +51,9 @@ public class BooleansTests extends ESTestCase { assertThat(Booleans.parseBoolean(null, false), is(false)); assertThat(Booleans.parseBoolean(null, true), is(true)); - assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes", "1"), randomFrom(null, Boolean.TRUE, Boolean.FALSE)), is(true)); - assertThat(Booleans.parseBoolean(randomFrom("false", "off", "no", "0"), randomFrom(null, Boolean.TRUE, Boolean.FALSE)), is(false)); - assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes").toUpperCase(Locale.ROOT),randomFrom(null, Boolean.TRUE, Boolean.FALSE)), is(true)); + assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes", "1"), randomFrom(Boolean.TRUE, Boolean.FALSE, 
null)), is(true)); + assertThat(Booleans.parseBoolean(randomFrom("false", "off", "no", "0"), randomFrom(Boolean.TRUE, Boolean.FALSE, null)), is(false)); + assertThat(Booleans.parseBoolean(randomFrom("true", "on", "yes").toUpperCase(Locale.ROOT),randomFrom(Boolean.TRUE, Boolean.FALSE, null)), is(true)); assertThat(Booleans.parseBoolean(null, Boolean.FALSE), is(false)); assertThat(Booleans.parseBoolean(null, Boolean.TRUE), is(true)); assertThat(Booleans.parseBoolean(null, null), nullValue()); @@ -70,7 +70,7 @@ public class BooleansTests extends ESTestCase { assertThat(Booleans.parseBooleanExact(randomFrom("true", "on", "yes", "1")), is(true)); assertThat(Booleans.parseBooleanExact(randomFrom("false", "off", "no", "0")), is(false)); try { - Booleans.parseBooleanExact(randomFrom(null, "fred", "foo", "barney")); + Booleans.parseBooleanExact(randomFrom("fred", "foo", "barney", null)); fail("Expected exception while parsing invalid boolean value "); } catch (Exception ex) { assertTrue(ex instanceof IllegalArgumentException); diff --git a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java index 0187bb28f36..7f89acd169e 100644 --- a/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java +++ b/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java @@ -62,6 +62,7 @@ import org.elasticsearch.test.ESIntegTestCase.Scope; import org.elasticsearch.test.InternalTestCluster; import org.elasticsearch.test.discovery.ClusterDiscoveryConfiguration; import org.elasticsearch.test.disruption.BlockClusterStateProcessing; +import org.elasticsearch.test.disruption.BridgePartition; import org.elasticsearch.test.disruption.IntermittentLongGCDisruption; import org.elasticsearch.test.disruption.LongGCDisruption; import org.elasticsearch.test.disruption.NetworkDelaysPartition; @@ -447,8 +448,7 @@ public class 
DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { final int seconds = !(TEST_NIGHTLY && rarely()) ? 1 : 5; final String timeout = seconds + "s"; - // TODO: add node count randomizaion - final List nodes = startCluster(3); + final List nodes = startCluster(rarely() ? 5 : 3); assertAcked(prepareCreate("test") .setSettings(Settings.builder() @@ -540,7 +540,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { logger.info("stopping disruption"); disruptionScheme.stopDisrupting(); for (String node : internalCluster().getNodeNames()) { - ensureStableCluster(3, TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + + ensureStableCluster(nodes.size(), TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + DISRUPTION_HEALING_OVERHEAD.millis()), true, node); } ensureGreen("test"); @@ -548,7 +548,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { logger.info("validating successful docs"); for (String node : nodes) { try { - logger.debug("validating through node [{}]", node); + logger.debug("validating through node [{}] ([{}] acked docs)", node, ackedDocs.size()); for (String id : ackedDocs.keySet()) { assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found", client(node).prepareGet("test", "type", id).setPreference("_local").get().isExists()); @@ -1192,7 +1192,8 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase { new NetworkUnresponsivePartition(random()), new NetworkDelaysPartition(random()), new NetworkDisconnectPartition(random()), - new SlowClusterStateProcessing(random()) + new SlowClusterStateProcessing(random()), + new BridgePartition(random(), randomBoolean()) ); Collections.shuffle(list, random()); setDisruptionScheme(list.get(0)); diff --git a/docs/resiliency/index.asciidoc b/docs/resiliency/index.asciidoc index 6f3ed169709..802c380b4a7 100644 --- a/docs/resiliency/index.asciidoc +++ b/docs/resiliency/index.asciidoc 
@@ -55,6 +55,14 @@ If you encounter an issue, https://github.com/elastic/elasticsearch/issues[pleas We are committed to tracking down and fixing all the issues that are posted. +[float] +==== Jepsen Tests + +The Jepsen platform is specifically designed to test distributed systems. It is not a single test and is regularly adapted +to create new scenarios. We have ported all published scenarios to our testing infrastructure. Of course, +as the system evolves, new scenarios can come up that are not yet covered. We are committed to investigating all new scenarios and will +report issues that we find on this page and in our GitHub repository. + [float] === Better request retry mechanism when nodes are disconnected (STATUS: ONGOING) @@ -102,17 +110,31 @@ Indices stats and indices segments requests reach out to all nodes that have sha while the stats request arrives will make that part of the request fail and are just ignored in the overall stats result. {GIT}13719[#13719] [float] -=== Jepsen Test Failures (STATUS: ONGOING) +=== Documentation of guarantees and handling of failures (STATUS: ONGOING) -We have increased our test coverage to include scenarios tested by Jepsen. We make heavy use of randomization to expand on the scenarios that can be tested and to introduce new error conditions. You can follow the work on the master branch of the https://github.com/elastic/elasticsearch/blob/master/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java[`DiscoveryWithServiceDisruptionsIT` class], where we will add more tests as time progresses. +This status page is a start, but we can do a better job of explicitly documenting the processes at work in Elasticsearch and what happens +in the case of each type of failure. The plan is to have a test case that validates each behavior under simulated conditions. Every test +will document the expected results, the associated test code, and an explicit PASS or FAIL status for each simulated case.
[float] -=== Document guarantees and handling of failure (STATUS: ONGOING) +=== Run Jepsen (STATUS: ONGOING) + +We have ported all of the known scenarios in the Jepsen blogs to our testing infrastructure. The new tests are run continuously in our +testing farm and are passing. We are also working on running Jepsen independently to verify that no failures are found. -This status page is a start, but we can do a better job of explicitly documenting the processes at work in Elasticsearch, and what happens in the case of each type of failure. The plan is to have a test case that validates each behavior under simulated conditions. Every test will document the expected results, the associated test code and an explicit PASS or FAIL status for each simulated case. == Unreleased +[float] +=== Port Jepsen tests to our testing framework (STATUS: UNRELEASED, v5.0.0) + +We have increased our test coverage to include scenarios tested by Jepsen, as described in the Elasticsearch-related blogs. We make heavy +use of randomization to expand on the scenarios that can be tested and to introduce new error conditions. +You can follow the work on the master branch of the +https://github.com/elastic/elasticsearch/blob/master/core/src/test/java/org/elasticsearch/discovery/DiscoveryWithServiceDisruptionsIT.java[`DiscoveryWithServiceDisruptionsIT` class], +where the `testAckedIndexing` test was specifically added to cover known Jepsen-related scenarios.
+ + [float] === Loss of documents during network partition (STATUS: UNRELEASED, v5.0.0) diff --git a/modules/reindex/src/test/java/org/elasticsearch/index/reindex/ReindexScriptTests.java b/modules/reindex/src/test/java/org/elasticsearch/index/reindex/ReindexScriptTests.java index 74b7548cd63..c70b80b8e37 100644 --- a/modules/reindex/src/test/java/org/elasticsearch/index/reindex/ReindexScriptTests.java +++ b/modules/reindex/src/test/java/org/elasticsearch/index/reindex/ReindexScriptTests.java @@ -22,7 +22,6 @@ package org.elasticsearch.index.reindex; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.common.lucene.uid.Versions; -import org.elasticsearch.script.ExecutableScript; import org.elasticsearch.script.ScriptService; import java.util.Map; @@ -106,7 +105,7 @@ public class ReindexScriptTests extends AbstractAsyncBulkIndexByScrollActionScri } public void testSetTimestamp() throws Exception { - String timestamp = randomFrom(null, "now", "1234"); + String timestamp = randomFrom("now", "1234", null); IndexRequest index = applyScript((Map ctx) -> ctx.put("_timestamp", timestamp)); assertEquals(timestamp, index.timestamp()); } diff --git a/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java index d04d12304de..6d63b6a5428 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java @@ -92,6 +92,7 @@ import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ExecutorService; @@ -344,9 +345,15 @@ public abstract class ESTestCase extends LuceneTestCase { /** Pick a random object from the given array. The array must not be empty. */ public static T randomFrom(T... 
array) { - return RandomPicks.randomFrom(random(), array); + return randomFrom(random(), array); } + /** Pick a random object from the given array. The array must not be empty. */ + public static T randomFrom(Random random, T... array) { + return RandomPicks.randomFrom(random, array); + } + + + /** Pick a random object from the given list. */ public static T randomFrom(List list) { return RandomPicks.randomFrom(random(), list);
diff --git a/test/framework/src/main/java/org/elasticsearch/test/disruption/BridgePartition.java b/test/framework/src/main/java/org/elasticsearch/test/disruption/BridgePartition.java
new file mode 100644
index 00000000000..1a9c2b686c3
--- /dev/null
+++ b/test/framework/src/main/java/org/elasticsearch/test/disruption/BridgePartition.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test.disruption;
+
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.transport.MockTransportService;
+
+import java.util.Random;
+
+import static org.elasticsearch.test.ESTestCase.randomFrom;
+
+/**
+ * Network disruption that splits the cluster into two groups of nodes that are cut off from each other,
+ * except for one randomly picked "bridge" node that can see and be seen by every node in both groups.
+ */
+public class BridgePartition extends NetworkPartition {
+
+    /** Name of the node skipped by {@link #applyToCluster}; re-picked at random on every call. */
+    String bridgeNode;
+
+    /** Selects the rule installed by {@code applyDisruption}: unresponsive rules when true, fail-to-send rules when false. */
+    final boolean unresponsive;
+
+    /**
+     * @param random       randomness source, handed to the parent class
+     * @param unresponsive {@code true} to install unresponsive rules, {@code false} to install fail-to-send rules
+     */
+    public BridgePartition(Random random, boolean unresponsive) {
+        super(random);
+        this.unresponsive = unresponsive;
+    }
+
+    /** Picks the bridge node at random and passes every other node of the cluster to {@code applyToNode}. */
+    @Override
+    public void applyToCluster(InternalTestCluster cluster) {
+        bridgeNode = randomFrom(random, cluster.getNodeNames());
+        this.cluster = cluster;
+        for (String nodeName : cluster.getNodeNames()) {
+            if (nodeName.equals(bridgeNode)) {
+                // the bridge node is the only one that is never passed to applyToNode
+                continue;
+            }
+            super.applyToNode(nodeName, cluster);
+        }
+    }
+
+    /** This scheme reports a healing time of zero seconds. */
+    @Override
+    public TimeValue expectedTimeToHeal() {
+        return TimeValue.timeValueSeconds(0);
+    }
+
+    /** Installs the selected rule in both directions between the two transport services. */
+    @Override
+    void applyDisruption(MockTransportService transportService1, MockTransportService transportService2) {
+        if (unresponsive == false) {
+            transportService1.addFailToSendNoConnectRule(transportService2);
+            transportService2.addFailToSendNoConnectRule(transportService1);
+            return;
+        }
+        transportService1.addUnresponsiveRule(transportService2);
+        transportService2.addUnresponsiveRule(transportService1);
+    }
+
+    @Override
+    protected String getPartitionDescription() {
+        final StringBuilder description = new StringBuilder("bridge (super connected node: [");
+        description.append(bridgeNode).append("], unresponsive [").append(unresponsive).append("])");
+        return description.toString();
+    }
+}