Fix issue when relocation source and target routings are failed in same batch update

PR #19715 made AllocationService less lenient, requiring that ShardRouting instances passed to its applyStartedShards and applyFailedShards methods exist in the routing table. Because failing a primary shard also fails its initializing replica shards, concurrent replica shard failures that are handled in the same cluster state update might no longer reference existing replica entries in the routing table. To solve this, PR #19715 ordered the failures so that replica failures are handled before primary failures. There are, however, other failures that influence more than one routing entry. When there are failed shard entries for both a relocation source and its target, then, depending on the order, one or the other might point to an outdated shard entry. Since finding a good order is more difficult than applying the failures themselves, this commit re-adds parts of the ShardRouting re-resolve logic so that the applyFailedShards method can properly handle batches of shard failures.
2025-03-09 14:34:43 +00:00 · 2016-08-08 11:23:32 +02:00 · 2016-08-08 11:23:32 +02:00 · 180eff14dd
commit 180eff14dd
parent 97dfa2ba40
1 changed files with 18 additions and 12 deletions
--- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
@@ -235,18 +235,24 @@ public class AllocationService extends AbstractComponent {
        FailedRerouteAllocation allocation = new FailedRerouteAllocation(allocationDeciders, routingNodes, clusterState, failedShards,
            clusterInfoService.getClusterInfo(), currentNanoTime);

-        // as failing primaries also fail associated replicas, we fail replicas first here to avoid re-resolving replica ShardRouting
-        List<FailedRerouteAllocation.FailedShard> orderedFailedShards = new ArrayList<>(failedShards);
-        orderedFailedShards.sort(Comparator.comparing(failedShard -> failedShard.routingEntry.primary()));
-
-        for (FailedRerouteAllocation.FailedShard failedShardEntry : orderedFailedShards) {
-            ShardRouting failedShard = failedShardEntry.routingEntry;
-            final int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
-            UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShardEntry.message,
-                failedShardEntry.failure, failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
-                AllocationStatus.NO_ATTEMPT);
-            allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
-            applyFailedShard(allocation, failedShard, unassignedInfo);
+        for (FailedRerouteAllocation.FailedShard failedShardEntry : failedShards) {
+            ShardRouting shardToFail = failedShardEntry.routingEntry;
+            allocation.addIgnoreShardForNode(shardToFail.shardId(), shardToFail.currentNodeId());
+            // failing a primary also fails initializing replica shards, re-resolve ShardRouting
+            ShardRouting failedShard = routingNodes.getByAllocationId(shardToFail.shardId(), shardToFail.allocationId().getId());
+            if (failedShard != null) {
+                if (failedShard != shardToFail) {
+                    logger.trace("{} shard routing modified in an earlier iteration (previous: {}, current: {})",
+                        shardToFail.shardId(), shardToFail, failedShard);
+                }
+                int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
+                UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShardEntry.message,
+                    failedShardEntry.failure, failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
+                    AllocationStatus.NO_ATTEMPT);
+                applyFailedShard(allocation, failedShard, unassignedInfo);
+            } else {
+                logger.trace("{} shard routing failed in an earlier iteration (routing: {})", shardToFail.shardId(), shardToFail);
+            }
        }
        gatewayAllocator.applyFailedShards(allocation);