mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-17 10:25:15 +00:00
[Infra] re-send failed shard messages when receiving a cluster state that still refers to them
In rare cases we may fail to send a shard-failure event to the master, or there may be no known master at the moment the shard fails (e.g. a couple of nodes leave the cluster, canceling recoveries and causing the master to step down at the same time). When that happens and a cluster state later arrives from the (new) master, we should resend the shard failure so that the master can remove the shard from this node. Closes #6881
This commit is contained in:
parent
f1c2cdb9c8
commit
d869163b66
@ -85,8 +85,17 @@ public class ShardStateAction extends AbstractComponent {
|
||||
}
|
||||
|
||||
/**
 * Notifies the master node that the given shard has failed on this node, so the master
 * can remove the shard from this node in the next cluster state.
 *
 * @param shardRouting the routing entry of the failed shard
 * @param indexUUID    the UUID of the index the shard belongs to
 * @param reason       human-readable description of why the shard failed (included in the message sent to the master)
 * @param masterNode   the node currently believed to be the master; the failure event is sent (or applied locally) there
 * @throws ElasticsearchException if dispatching the failure event fails
 */
public void shardFailed(final ShardRouting shardRouting, final String indexUUID, final String reason, final DiscoveryNode masterNode) throws ElasticsearchException {
    // NOTE(review): innerShardFailed also logs "sending failed shard" at warn level,
    // so this event appears twice in the log — confirm whether one of the two is intended to go.
    logger.warn("{} sending failed shard for {}, indexUUID [{}], reason [{}]", shardRouting.shardId(), shardRouting, indexUUID, reason);
    innerShardFailed(shardRouting, indexUUID, reason, masterNode);
}
|
||||
|
||||
/**
 * Re-sends a shard-failure event for a shard that this node already reported as failed,
 * but that the (possibly new) master still refers to in its cluster state — e.g. because
 * the original failure message was lost or there was no known master at the time.
 * Logged at trace (not warn) because the original failure was already logged when it happened.
 *
 * @param shardRouting the routing entry of the previously failed shard
 * @param indexUUID    the UUID of the index the shard belongs to
 * @param reason       human-readable description of why the failure is being re-sent
 * @param masterNode   the current master node to (re-)deliver the failure event to
 * @throws ElasticsearchException if dispatching the failure event fails
 */
public void resendShardFailed(final ShardRouting shardRouting, final String indexUUID, final String reason, final DiscoveryNode masterNode) throws ElasticsearchException {
    logger.trace("{} re-sending failed shard for {}, indexUUID [{}], reason [{}]", shardRouting.shardId(), shardRouting, indexUUID, reason);
    innerShardFailed(shardRouting, indexUUID, reason, masterNode);
}
|
||||
|
||||
private void innerShardFailed(final ShardRouting shardRouting, final String indexUUID, final String reason, final DiscoveryNode masterNode) {
|
||||
ShardRoutingEntry shardRoutingEntry = new ShardRoutingEntry(shardRouting, indexUUID, reason);
|
||||
logger.warn("{} sending failed shard for {}", shardRouting.shardId(), shardRoutingEntry);
|
||||
if (clusterService.localNode().equals(masterNode)) {
|
||||
innerShardFailed(shardRoutingEntry);
|
||||
} else {
|
||||
|
@ -517,7 +517,14 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
|
||||
final int shardId = shardRouting.id();
|
||||
|
||||
if (!indexService.hasShard(shardId) && shardRouting.started()) {
|
||||
if (!failedShards.containsKey(shardRouting.shardId())) {
|
||||
if (failedShards.containsKey(shardRouting.shardId())) {
|
||||
if (nodes.masterNode() != null) {
|
||||
shardStateAction.resendShardFailed(shardRouting, indexMetaData.getUUID(),
|
||||
"master " + nodes.masterNode() + " marked shard as started, but shard has previous failed. resending shard failure.",
|
||||
nodes.masterNode()
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// the master thinks we are started, but we don't have this shard at all, mark it as failed
|
||||
logger.warn("[{}][{}] master [{}] marked shard as started, but shard has not been created, mark shard as failed", shardRouting.index(), shardId, nodes.masterNode());
|
||||
failedShards.put(shardRouting.shardId(), new FailedShard(shardRouting.version()));
|
||||
@ -650,8 +657,11 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
|
||||
// if there is no shard, create it
|
||||
if (!indexService.hasShard(shardId)) {
|
||||
if (failedShards.containsKey(shardRouting.shardId())) {
|
||||
// already tried to create this shard but it failed - ignore
|
||||
logger.trace("[{}][{}] not initializing, this shards failed to recover on this node before, waiting for reassignment", shardRouting.index(), shardRouting.id());
|
||||
if (nodes.masterNode() != null) {
|
||||
shardStateAction.resendShardFailed(shardRouting, indexMetaData.getUUID(),
|
||||
"master " + nodes.masterNode() + " marked shard as initializing, but shard is marked as failed, resend shard failure",
|
||||
nodes.masterNode());
|
||||
}
|
||||
return;
|
||||
}
|
||||
try {
|
||||
|
Loading…
x
Reference in New Issue
Block a user