[Infra] re-send failed shard messages when receiving a cluster state that still refers to them

In rare cases we may fail to send a shard failure event to the master, or there is no known master when the shard has failed (ex. a couple of node leave the cluster canceling recoveries and causing a master to step down at the same time). When that happens and a cluster state arrives from the (new) master we should resend the shard failure in order for the master to remove the shard from this node.

Closes #6881
This commit is contained in:
Boaz Leskes 2014-07-15 19:56:14 +02:00
parent f1c2cdb9c8
commit d869163b66
2 changed files with 23 additions and 4 deletions

View File

@ -85,8 +85,17 @@ public class ShardStateAction extends AbstractComponent {
}
public void shardFailed(final ShardRouting shardRouting, final String indexUUID, final String reason, final DiscoveryNode masterNode) throws ElasticsearchException {
logger.warn("{} sending failed shard for {}, indexUUID [{}], reason [{}]", shardRouting.shardId(), shardRouting, indexUUID, reason);
innerShardFailed(shardRouting, indexUUID, reason, masterNode);
}
public void resendShardFailed(final ShardRouting shardRouting, final String indexUUID, final String reason, final DiscoveryNode masterNode) throws ElasticsearchException {
logger.trace("{} re-sending failed shard for {}, indexUUID [{}], reason [{}]", shardRouting.shardId(), shardRouting, indexUUID, reason);
innerShardFailed(shardRouting, indexUUID, reason, masterNode);
}
private void innerShardFailed(final ShardRouting shardRouting, final String indexUUID, final String reason, final DiscoveryNode masterNode) {
ShardRoutingEntry shardRoutingEntry = new ShardRoutingEntry(shardRouting, indexUUID, reason);
logger.warn("{} sending failed shard for {}", shardRouting.shardId(), shardRoutingEntry);
if (clusterService.localNode().equals(masterNode)) {
innerShardFailed(shardRoutingEntry);
} else {

View File

@ -517,7 +517,14 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
final int shardId = shardRouting.id();
if (!indexService.hasShard(shardId) && shardRouting.started()) {
if (!failedShards.containsKey(shardRouting.shardId())) {
if (failedShards.containsKey(shardRouting.shardId())) {
if (nodes.masterNode() != null) {
shardStateAction.resendShardFailed(shardRouting, indexMetaData.getUUID(),
"master " + nodes.masterNode() + " marked shard as started, but shard has previous failed. resending shard failure.",
nodes.masterNode()
);
}
} else {
// the master thinks we are started, but we don't have this shard at all, mark it as failed
logger.warn("[{}][{}] master [{}] marked shard as started, but shard has not been created, mark shard as failed", shardRouting.index(), shardId, nodes.masterNode());
failedShards.put(shardRouting.shardId(), new FailedShard(shardRouting.version()));
@ -650,8 +657,11 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
// if there is no shard, create it
if (!indexService.hasShard(shardId)) {
if (failedShards.containsKey(shardRouting.shardId())) {
// already tried to create this shard but it failed - ignore
logger.trace("[{}][{}] not initializing, this shards failed to recover on this node before, waiting for reassignment", shardRouting.index(), shardRouting.id());
if (nodes.masterNode() != null) {
shardStateAction.resendShardFailed(shardRouting, indexMetaData.getUUID(),
"master " + nodes.masterNode() + " marked shard as initializing, but shard is marked as failed, resend shard failure",
nodes.masterNode());
}
return;
}
try {