Do not ignore ConnectTransportException for shard replication operations
A ConnectTransportException should fail the replica shard Closes #6183
This commit is contained in:
parent
75ed24f6b6
commit
107534c062
|
@ -170,9 +170,6 @@ public abstract class TransportShardReplicationOperationAction<Request extends S
|
|||
return true;
|
||||
}
|
||||
Throwable cause = ExceptionsHelper.unwrapCause(e);
|
||||
if (cause instanceof ConnectTransportException) {
|
||||
return true;
|
||||
}
|
||||
// on version conflict or document missing, it means
|
||||
// that a news change has crept into the replica, and its fine
|
||||
if (cause instanceof VersionConflictEngineException) {
|
||||
|
@ -697,7 +694,7 @@ public abstract class TransportShardReplicationOperationAction<Request extends S
|
|||
@Override
|
||||
public void handleException(TransportException exp) {
|
||||
logger.trace("[{}] Transport failure during replica request [{}] ", exp, node, request);
|
||||
if (!ignoreReplicaException(exp.unwrapCause())) {
|
||||
if (!ignoreReplicaException(exp)) {
|
||||
logger.warn("Failed to perform " + transportAction + " on remote replica " + node + shardIt.shardId(), exp);
|
||||
shardStateAction.shardFailed(shard, indexMetaData.getUUID(),
|
||||
"Failed to perform [" + transportAction + "] on replica, message [" + ExceptionsHelper.detailedMessage(exp) + "]");
|
||||
|
|
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index;
|
||||
|
||||
import com.google.common.base.Predicate;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
|
||||
import org.elasticsearch.action.index.IndexResponse;
|
||||
import org.elasticsearch.cluster.ClusterState;
|
||||
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
|
||||
import org.elasticsearch.cluster.routing.MutableShardRouting;
|
||||
import org.elasticsearch.cluster.routing.RoutingNodes;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.discovery.Discovery;
|
||||
import org.elasticsearch.discovery.DiscoverySettings;
|
||||
import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
||||
import org.elasticsearch.test.junit.annotations.TestLogging;
|
||||
import org.elasticsearch.test.transport.MockTransportService;
|
||||
import org.elasticsearch.transport.TransportModule;
|
||||
import org.elasticsearch.transport.TransportService;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.elasticsearch.cluster.routing.ShardRoutingState.*;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
/**
|
||||
* Test failure when index replication actions fail mid-flight
|
||||
*/
|
||||
@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.TEST, numDataNodes = 0, transportClientRatio = 0)
|
||||
@TestLogging("discovery.zen:TRACE,action.index:TRACE,indices.recovery:TRACE")
|
||||
public class TransportIndexFailuresTest extends ElasticsearchIntegrationTest {
|
||||
|
||||
private static final Settings nodeSettings = ImmutableSettings.settingsBuilder()
|
||||
.put("discovery.type", "zen") // <-- To override the local setting if set externally
|
||||
.put("discovery.zen.fd.ping_timeout", "1s") // <-- for hitting simulated network failures quickly
|
||||
.put("discovery.zen.fd.ping_retries", "1") // <-- for hitting simulated network failures quickly
|
||||
.put(DiscoverySettings.PUBLISH_TIMEOUT, "1s") // <-- for hitting simulated network failures quickly
|
||||
.put("discovery.zen.minimum_master_nodes", 1)
|
||||
.put(TransportModule.TRANSPORT_SERVICE_TYPE_KEY, MockTransportService.class.getName())
|
||||
.build();
|
||||
|
||||
@Override
|
||||
protected int numberOfShards() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int numberOfReplicas() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNetworkPartitionDuringReplicaIndexOp() throws Exception {
|
||||
final String INDEX = "testidx";
|
||||
|
||||
List<String> nodes = internalCluster().startNodesAsync(2, nodeSettings).get();
|
||||
|
||||
// Create index test with 1 shard, 1 replica and ensure it is green
|
||||
createIndex(INDEX);
|
||||
ensureGreen(INDEX);
|
||||
|
||||
// Disable allocation so the replica cannot be reallocated when it fails
|
||||
Settings s = ImmutableSettings.builder().put("cluster.routing.allocation.enable", "none").build();
|
||||
client().admin().cluster().prepareUpdateSettings().setTransientSettings(s).get();
|
||||
|
||||
// Determine which node holds the primary shard
|
||||
ClusterState state = getNodeClusterState(nodes.get(0));
|
||||
IndexShardRoutingTable shard = state.getRoutingTable().index(INDEX).shard(0);
|
||||
String primaryNode;
|
||||
String replicaNode;
|
||||
if (shard.getShards().get(0).primary()) {
|
||||
primaryNode = nodes.get(0);
|
||||
replicaNode = nodes.get(1);
|
||||
} else {
|
||||
primaryNode = nodes.get(1);
|
||||
replicaNode = nodes.get(0);
|
||||
}
|
||||
logger.info("--> primary shard is on {}", primaryNode);
|
||||
|
||||
// Index a document to make sure everything works well
|
||||
IndexResponse resp = internalCluster().client(primaryNode).prepareIndex(INDEX, "doc").setSource("foo", "bar").get();
|
||||
assertThat("document exists on primary node",
|
||||
internalCluster().client(primaryNode).prepareGet(INDEX, "doc", resp.getId()).setPreference("_only_local").get().isExists(),
|
||||
equalTo(true));
|
||||
assertThat("document exists on replica node",
|
||||
internalCluster().client(replicaNode).prepareGet(INDEX, "doc", resp.getId()).setPreference("_only_local").get().isExists(),
|
||||
equalTo(true));
|
||||
|
||||
// Disrupt the network so indexing requests fail to replicate
|
||||
logger.info("--> preventing index/replica operations");
|
||||
TransportService mockTransportService = internalCluster().getInstance(TransportService.class, primaryNode);
|
||||
((MockTransportService) mockTransportService).addFailToSendNoConnectRule(
|
||||
internalCluster().getInstance(Discovery.class, replicaNode).localNode(),
|
||||
ImmutableSet.of("index/replica")
|
||||
);
|
||||
mockTransportService = internalCluster().getInstance(TransportService.class, replicaNode);
|
||||
((MockTransportService) mockTransportService).addFailToSendNoConnectRule(
|
||||
internalCluster().getInstance(Discovery.class, primaryNode).localNode(),
|
||||
ImmutableSet.of("index/replica")
|
||||
);
|
||||
|
||||
logger.info("--> indexing into primary");
|
||||
// the replica shard should now be marked as failed because the replication operation will fail
|
||||
resp = internalCluster().client(primaryNode).prepareIndex(INDEX, "doc").setSource("foo", "baz").get();
|
||||
// wait until the cluster reaches an exact yellow state, meaning replica has failed
|
||||
assertBusy(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
assertThat(client().admin().cluster().prepareHealth().get().getStatus(), equalTo(ClusterHealthStatus.YELLOW));
|
||||
}
|
||||
});
|
||||
assertThat("document should still be indexed and available",
|
||||
client().prepareGet(INDEX, "doc", resp.getId()).get().isExists(), equalTo(true));
|
||||
|
||||
state = getNodeClusterState(randomFrom(nodes.toArray(Strings.EMPTY_ARRAY)));
|
||||
RoutingNodes rn = state.routingNodes();
|
||||
logger.info("--> counts: total: {}, unassigned: {}, initializing: {}, relocating: {}, started: {}",
|
||||
rn.shards(new Predicate<MutableShardRouting>() {
|
||||
@Override
|
||||
public boolean apply(org.elasticsearch.cluster.routing.MutableShardRouting input) {
|
||||
return true;
|
||||
}
|
||||
}).size(),
|
||||
rn.shardsWithState(UNASSIGNED).size(),
|
||||
rn.shardsWithState(INITIALIZING).size(),
|
||||
rn.shardsWithState(RELOCATING).size(),
|
||||
rn.shardsWithState(STARTED).size());
|
||||
logger.info("--> unassigned: {}, initializing: {}, relocating: {}, started: {}",
|
||||
rn.shardsWithState(UNASSIGNED),
|
||||
rn.shardsWithState(INITIALIZING),
|
||||
rn.shardsWithState(RELOCATING),
|
||||
rn.shardsWithState(STARTED));
|
||||
|
||||
assertThat("only a single shard is now active (replica should be failed and not reallocated)",
|
||||
rn.shardsWithState(STARTED).size(), equalTo(1));
|
||||
}
|
||||
|
||||
private ClusterState getNodeClusterState(String node) {
|
||||
return internalCluster().client(node).admin().cluster().prepareState().setLocal(true).get().getState();
|
||||
}
|
||||
}
|
|
@ -32,6 +32,7 @@ import org.elasticsearch.threadpool.ThreadPool;
|
|||
import org.elasticsearch.transport.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
/**
|
||||
|
@ -91,6 +92,33 @@ public class MockTransportService extends TransportService {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a rule that will cause matching operations to throw ConnectTransportExceptions
|
||||
*/
|
||||
public void addFailToSendNoConnectRule(DiscoveryNode node, final Set<String> blockedActions) {
|
||||
|
||||
((LookupTestTransport) transport).transports.put(node, new DelegateTransport(original) {
|
||||
@Override
|
||||
public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
|
||||
original.connectToNode(node);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void connectToNodeLight(DiscoveryNode node) throws ConnectTransportException {
|
||||
original.connectToNodeLight(node);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void sendRequest(DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {
|
||||
if (blockedActions.contains(action)) {
|
||||
logger.info("--> preventing {} request", action);
|
||||
throw new ConnectTransportException(node, "DISCONNECT: prevented " + action + " request");
|
||||
}
|
||||
original.sendRequest(node, requestId, action, request, options);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a rule that will cause ignores each send request, simulating an unresponsive node
|
||||
* and failing to connect once the rule was added.
|
||||
|
|
Loading…
Reference in New Issue