Fix unresponsive network simulation (#42579)
Unresponsive network simulation would throw away requests. However, then we no longer have any guarantees that a transport action either succeeds or fails, which could lead to hangs (example: unclosed IndexShard permits). Closes #42244
This commit is contained in:
parent
d3136f99e6
commit
6108899a2d
|
@ -41,6 +41,7 @@ import org.elasticsearch.common.unit.TimeValue;
|
|||
import org.elasticsearch.common.util.MockPageCacheRecycler;
|
||||
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
||||
import org.elasticsearch.common.util.concurrent.RunOnce;
|
||||
import org.elasticsearch.core.internal.io.IOUtils;
|
||||
import org.elasticsearch.indices.breaker.NoneCircuitBreakerService;
|
||||
import org.elasticsearch.node.Node;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
|
@ -68,6 +69,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.CopyOnWriteArrayList;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
|
@ -281,8 +283,25 @@ public final class MockTransportService extends TransportService {
|
|||
return () -> {};
|
||||
});
|
||||
|
||||
transport().addSendBehavior(transportAddress, (connection, requestId, action, request, options) -> {
|
||||
// don't send anything, the receiving node is unresponsive
|
||||
transport().addSendBehavior(transportAddress, new StubbableTransport.SendRequestBehavior() {
|
||||
private Set<Transport.Connection> toClose = ConcurrentHashMap.newKeySet();
|
||||
@Override
|
||||
public void sendRequest(Transport.Connection connection, long requestId, String action,
|
||||
TransportRequest request, TransportRequestOptions options) {
|
||||
// don't send anything, the receiving node is unresponsive
|
||||
toClose.add(connection);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clearCallback() {
|
||||
// close to simulate that tcp-ip eventually times out and closes connection (necessary to ensure transport eventually
|
||||
// responds).
|
||||
try {
|
||||
IOUtils.close(toClose);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -19,21 +19,33 @@
|
|||
|
||||
package org.elasticsearch.test.disruption;
|
||||
|
||||
import org.elasticsearch.action.admin.cluster.health.ClusterHealthAction;
|
||||
import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest;
|
||||
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
|
||||
import org.elasticsearch.cluster.NodeConnectionsService;
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.test.ESIntegTestCase;
|
||||
import org.elasticsearch.test.InternalTestCluster;
|
||||
import org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions;
|
||||
import org.elasticsearch.test.transport.MockTransportService;
|
||||
import org.elasticsearch.threadpool.ThreadPool;
|
||||
import org.elasticsearch.transport.TransportException;
|
||||
import org.elasticsearch.transport.TransportResponse;
|
||||
import org.elasticsearch.transport.TransportResponseHandler;
|
||||
import org.elasticsearch.transport.TransportService;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
|
||||
|
||||
|
@ -102,4 +114,80 @@ public class NetworkDisruptionIT extends ESIntegTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testTransportRespondsEventually() throws InterruptedException {
|
||||
internalCluster().setBootstrapMasterNodeIndex(0);
|
||||
internalCluster().ensureAtLeastNumDataNodes(randomIntBetween(3, 5));
|
||||
final NetworkDisruption.DisruptedLinks disruptedLinks;
|
||||
if (randomBoolean()) {
|
||||
disruptedLinks = TwoPartitions.random(random(), internalCluster().getNodeNames());
|
||||
} else {
|
||||
disruptedLinks = NetworkDisruption.Bridge.random(random(), internalCluster().getNodeNames());
|
||||
}
|
||||
|
||||
NetworkDisruption networkDisruption = new NetworkDisruption(disruptedLinks, randomFrom(new NetworkDisruption.NetworkUnresponsive(),
|
||||
new NetworkDisruption.NetworkDisconnect(), NetworkDisruption.NetworkDelay.random(random())));
|
||||
internalCluster().setDisruptionScheme(networkDisruption);
|
||||
|
||||
networkDisruption.startDisrupting();
|
||||
|
||||
int requests = randomIntBetween(1, 200);
|
||||
CountDownLatch latch = new CountDownLatch(requests);
|
||||
for (int i = 0; i < requests - 1; ++i) {
|
||||
sendRequest(
|
||||
internalCluster().getInstance(TransportService.class), internalCluster().getInstance(TransportService.class),
|
||||
latch);
|
||||
}
|
||||
|
||||
// send a request that is guaranteed disrupted.
|
||||
Tuple<TransportService, TransportService> disruptedPair = findDisruptedPair(disruptedLinks);
|
||||
sendRequest(disruptedPair.v1(), disruptedPair.v2(), latch);
|
||||
|
||||
// give a bit of time to send something under disruption.
|
||||
assertFalse(latch.await(500, TimeUnit.MILLISECONDS)
|
||||
&& networkDisruption.getNetworkLinkDisruptionType() instanceof NetworkDisruption.NetworkDisconnect == false);
|
||||
networkDisruption.stopDisrupting();
|
||||
|
||||
latch.await(30, TimeUnit.SECONDS);
|
||||
assertEquals("All requests must respond, requests: " + requests, 0, latch.getCount());
|
||||
}
|
||||
|
||||
private Tuple<TransportService, TransportService> findDisruptedPair(NetworkDisruption.DisruptedLinks disruptedLinks) {
|
||||
Optional<Tuple<TransportService, TransportService>> disruptedPair = disruptedLinks.nodes().stream()
|
||||
.flatMap(n1 -> disruptedLinks.nodes().stream().map(n2 -> Tuple.tuple(n1, n2)))
|
||||
.filter(pair -> disruptedLinks.disrupt(pair.v1(), pair.v2()))
|
||||
.map(pair -> Tuple.tuple(internalCluster().getInstance(TransportService.class, pair.v1()),
|
||||
internalCluster().getInstance(TransportService.class, pair.v2())))
|
||||
.findFirst();
|
||||
// since we have 3+ nodes, we are sure to find a disrupted pair, also for bridge disruptions.
|
||||
assertTrue(disruptedPair.isPresent());
|
||||
return disruptedPair.get();
|
||||
}
|
||||
|
||||
private void sendRequest(TransportService source, TransportService target, CountDownLatch latch) {
|
||||
source.sendRequest(target.getLocalNode(), ClusterHealthAction.NAME, new ClusterHealthRequest(),
|
||||
new TransportResponseHandler<TransportResponse>() {
|
||||
private AtomicBoolean responded = new AtomicBoolean();
|
||||
@Override
|
||||
public void handleResponse(TransportResponse response) {
|
||||
assertTrue(responded.compareAndSet(false, true));
|
||||
latch.countDown();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleException(TransportException exp) {
|
||||
assertTrue(responded.compareAndSet(false, true));
|
||||
latch.countDown();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String executor() {
|
||||
return ThreadPool.Names.SAME;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TransportResponse read(StreamInput in) throws IOException {
|
||||
return ClusterHealthResponse.readResponseFrom(in);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue