Make allocation decisions at node level first for pending task optimi… (#534) (#739)

* Make allocation decisions at node level first for pending task optimization Signed-off-by: Ankit Jain <akjain@amazon.com> * Addressing review comments Signed-off-by: Ankit Jain <akjain@amazon.com> * Fixing benchmark and adding debug mode tests Signed-off-by: Ankit Jain <akjain@amazon.com> * Fixing typo in previous commit Signed-off-by: Ankit Jain <akjain@amazon.com> * Moving test file to correct package Signed-off-by: Ankit Jain <akjain@amazon.com> * Addressing review comments Signed-off-by: Ankit Jain <akjain@amazon.com>
2025-03-09 14:34:43 +00:00 · 2021-05-21 02:32:18 +05:30 · 2021-05-21 02:32:18 +05:30 · edbe5ae7e8
commit edbe5ae7e8
parent e90bde5a05
13 changed files with 685 additions and 80 deletions
--- a/benchmarks/src/main/java/org/opensearch/benchmark/routing/allocation/AllocationBenchmark.java
+++ b/benchmarks/src/main/java/org/opensearch/benchmark/routing/allocation/AllocationBenchmark.java
@ -31,16 +31,6 @@

 package org.opensearch.benchmark.routing.allocation;

-import org.opensearch.Version;
-import org.opensearch.cluster.ClusterName;
-import org.opensearch.cluster.ClusterState;
-import org.opensearch.cluster.metadata.IndexMetadata;
-import org.opensearch.cluster.metadata.Metadata;
-import org.opensearch.cluster.node.DiscoveryNodes;
-import org.opensearch.cluster.routing.RoutingTable;
-import org.opensearch.cluster.routing.ShardRoutingState;
-import org.opensearch.cluster.routing.allocation.AllocationService;
-import org.opensearch.common.settings.Settings;
 import org.openjdk.jmh.annotations.Benchmark;
 import org.openjdk.jmh.annotations.BenchmarkMode;
 import org.openjdk.jmh.annotations.Fork;
@ -52,8 +42,20 @@ import org.openjdk.jmh.annotations.Scope;
 import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
 import org.openjdk.jmh.annotations.Warmup;
+import org.opensearch.Version;
+import org.opensearch.cluster.ClusterName;
+import org.opensearch.cluster.ClusterState;
+import org.opensearch.cluster.metadata.IndexMetadata;
+import org.opensearch.cluster.metadata.Metadata;
+import org.opensearch.cluster.node.DiscoveryNodes;
+import org.opensearch.cluster.routing.RoutingTable;
+import org.opensearch.cluster.routing.ShardRouting;
+import org.opensearch.cluster.routing.ShardRoutingState;
+import org.opensearch.cluster.routing.allocation.AllocationService;
+import org.opensearch.common.settings.Settings;

-import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 import java.util.concurrent.TimeUnit;

@Fork(3)
@ -71,75 +73,103 @@ public class AllocationBenchmark {
    // support to constrain the combinations of benchmark parameters and we do not want to rely on OptionsBuilder as each benchmark would
    // need its own main method and we cannot execute more than one class with a main method per JAR.
    @Param({
-        // indices| shards| replicas| nodes
-        "       10|      1|        0|     1",
-        "       10|      3|        0|     1",
-        "       10|     10|        0|     1",
-        "      100|      1|        0|     1",
-        "      100|      3|        0|     1",
-        "      100|     10|        0|     1",
+        // indices| shards| replicas| source| target| concurrentRecoveries
+        "       10|      2|        0|       1|      1|      1|",
+        "       10|      3|        0|       1|      1|      2|",
+        "       10|     10|        0|       1|      1|      5|",
+        "      100|      1|        0|       1|      1|     10|",
+        "      100|      3|        0|       1|      1|     10|",
+        "      100|     10|        0|       1|      1|     10|",

-        "       10|      1|        0|    10",
-        "       10|      3|        0|    10",
-        "       10|     10|        0|    10",
-        "      100|      1|        0|    10",
-        "      100|      3|        0|    10",
-        "      100|     10|        0|    10",
+        "       10|      2|        0|      10|     10|      1|",
+        "       10|      3|        0|      10|      5|      2|",
+        "       10|     10|        0|      10|      5|      5|",
+        "      100|      1|        0|       5|     10|      5|",
+        "      100|      3|        0|      10|      5|      5|",
+        "      100|     10|        0|      10|     20|      6|",

-        "       10|      1|        1|    10",
-        "       10|      3|        1|    10",
-        "       10|     10|        1|    10",
-        "      100|      1|        1|    10",
-        "      100|      3|        1|    10",
-        "      100|     10|        1|    10",
+        "       10|      1|        1|      10|     10|      1|",
+        "       10|      3|        1|      10|      3|      3|",
+        "       10|     10|        1|       5|     12|      5|",
+        "      100|      1|        1|      10|     10|      6|",
+        "      100|      3|        1|      10|      5|      8|",
+        "      100|     10|        1|       8|     17|      8|",

-        "       10|      1|        2|    10",
-        "       10|      3|        2|    10",
-        "       10|     10|        2|    10",
-        "      100|      1|        2|    10",
-        "      100|      3|        2|    10",
-        "      100|     10|        2|    10",
+        "       10|      1|        2|      10|     10|      1|",
+        "       10|      3|        2|      10|      5|      3|",
+        "       10|     10|        2|       5|     10|      5|",
+        "      100|      1|        2|      10|      8|      7|",
+        "      100|      3|        2|      13|     17|      5|",
+        "      100|     10|        2|      10|     20|      8|",

-        "       10|      1|        0|    50",
-        "       10|      3|        0|    50",
-        "       10|     10|        0|    50",
-        "      100|      1|        0|    50",
-        "      100|      3|        0|    50",
-        "      100|     10|        0|    50",
+        "       10|      2|        1|      20|     20|      1|",
+        "       10|      3|        1|      20|     30|      1|",
+        "       10|     10|        1|      20|     10|      3|",
+        "      100|      1|        1|      20|      5|      5|",
+        "      100|      3|        1|      20|     23|      6|",
+        "      100|     10|        1|      40|     20|      8|",

-        "       10|      1|        1|    50",
-        "       10|      3|        1|    50",
-        "       10|     10|        1|    50",
-        "      100|      1|        1|    50",
-        "      100|      3|        1|    50",
-        "      100|     10|        1|    50",
+        "       10|      3|        2|      50|     30|      1|",
+        "       10|      3|        2|      50|     25|      1|",
+        "       10|     10|        1|      50|     33|      2|",
+        "      100|      1|        1|      40|     50|      2|",
+        "      100|      3|        1|      50|     70|      3|",
+        "      100|     10|        1|      60|     50|      3|",

-        "       10|      1|        2|    50",
-        "       10|      3|        2|    50",
-        "       10|     10|        2|    50",
-        "      100|      1|        2|    50",
-        "      100|      3|        2|    50",
-        "      100|     10|        2|    50" })
-    public String indicesShardsReplicasNodes = "10|1|0|1";
+        "       10|     10|        2|      50|     50|      1|",
+        "       10|      3|        2|      50|     30|      1|",
+        "       10|     10|        2|      50|     40|      2|",
+        "      100|      1|        2|      40|     50|      2|",
+        "      100|      3|        2|      50|     30|      6|",
+        "      100|     10|        2|      33|     55|      6|",
+
+        "       500|     60|       1|     100|    100|     12|",
+        "       500|     60|       1|     100|     40|     12|",
+        "       500|     60|       1|      40|    100|     12|",
+
+        "       50|      60|       1|     100|    100|      6|",
+        "       50|      60|       1|     100|     40|      6|",
+        "       50|      60|       1|      40|    100|      6|" })
+    public String indicesShardsReplicasSourceTargetRecoveries = "10|1|0|1|1|1";

    public int numTags = 2;
+    public int numZone = 3;
+    public int concurrentRecoveries;
+    public int numIndices;
+    public int numShards;
+    public int numReplicas;
+    public int sourceNodes;
+    public int targetNodes;
+    public int clusterConcurrentRecoveries;

-    private AllocationService strategy;
+    private AllocationService initialClusterStrategy;
+    private AllocationService clusterExcludeStrategy;
+    private AllocationService clusterZoneAwareExcludeStrategy;
    private ClusterState initialClusterState;

    @Setup
    public void setUp() throws Exception {
-        final String[] params = indicesShardsReplicasNodes.split("\\|");
+        final String[] params = indicesShardsReplicasSourceTargetRecoveries.split("\\|");
+        numIndices = toInt(params[0]);
+        numShards = toInt(params[1]);
+        numReplicas = toInt(params[2]);
+        sourceNodes = toInt(params[3]);
+        targetNodes = toInt(params[4]);
+        concurrentRecoveries = toInt(params[5]);

-        int numIndices = toInt(params[0]);
-        int numShards = toInt(params[1]);
-        int numReplicas = toInt(params[2]);
-        int numNodes = toInt(params[3]);
+        int totalShardCount = (numReplicas + 1) * numShards * numIndices;

-        strategy = Allocators.createAllocationService(
-            Settings.builder().put("cluster.routing.allocation.awareness.attributes", "tag").build()
+        initialClusterStrategy = Allocators.createAllocationService(
+            Settings.builder()
+                .put("cluster.routing.allocation.awareness.attributes", "zone")
+                .put("cluster.routing.allocation.node_concurrent_recoveries", "20")
+                .put("cluster.routing.allocation.exclude.tag", "tag_0")
+                .build()
        );

+        // We'll try to move nodes from tag_1 to tag_0
+        clusterConcurrentRecoveries = Math.min(sourceNodes, targetNodes) * concurrentRecoveries;
+
        Metadata.Builder mb = Metadata.builder();
        for (int i = 1; i <= numIndices; i++) {
            mb.put(
@ -155,15 +185,37 @@ public class AllocationBenchmark {
            rb.addAsNew(metadata.index("test_" + i));
        }
        RoutingTable routingTable = rb.build();
-        DiscoveryNodes.Builder nb = DiscoveryNodes.builder();
-        for (int i = 1; i <= numNodes; i++) {
-            nb.add(Allocators.newNode("node" + i, Collections.singletonMap("tag", "tag_" + (i % numTags))));
-        }
        initialClusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
            .metadata(metadata)
            .routingTable(routingTable)
-            .nodes(nb)
+            .nodes(setUpClusterNodes(sourceNodes, targetNodes))
            .build();
+        // Start all unassigned shards
+        initialClusterState = initialClusterStrategy.reroute(initialClusterState, "reroute");
+        while (initialClusterState.getRoutingNodes().hasUnassignedShards()) {
+            initialClusterState = initialClusterStrategy.applyStartedShards(
+                initialClusterState,
+                initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING)
+            );
+            initialClusterState = initialClusterStrategy.reroute(initialClusterState, "reroute");
+        }
+        // Ensure all shards are started
+        while (initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() > 0) {
+            initialClusterState = initialClusterStrategy.applyStartedShards(
+                initialClusterState,
+                initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING)
+            );
+        }
+
+        assert (initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size() == totalShardCount);
+        assert (initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() == 0);
+        assert (initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size() == 0);
+        // make sure shards are only allocated on tag1
+        for (ShardRouting startedShard : initialClusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED)) {
+            assert (initialClusterState.getRoutingNodes().node(startedShard.currentNodeId()).node().getAttributes().get("tag")).equals(
+                "tag_1"
+            );
+        }
    }

    private int toInt(String v) {
@ -171,15 +223,58 @@ public class AllocationBenchmark {
    }

    @Benchmark
-    public ClusterState measureAllocation() {
+    public ClusterState measureExclusionOnZoneAwareStartedShard() throws Exception {
        ClusterState clusterState = initialClusterState;
-        while (clusterState.getRoutingNodes().hasUnassignedShards()) {
-            clusterState = strategy.applyStartedShards(
+        clusterZoneAwareExcludeStrategy = Allocators.createAllocationService(
+            Settings.builder()
+                .put("cluster.routing.allocation.awareness.attributes", "zone")
+                .put("cluster.routing.allocation.cluster_concurrent_recoveries", String.valueOf(clusterConcurrentRecoveries))
+                .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(concurrentRecoveries))
+                .put("cluster.routing.allocation.exclude.tag", "tag_1")
+                .build()
+        );
+        clusterState = clusterZoneAwareExcludeStrategy.reroute(clusterState, "reroute");
+        return clusterState;
+    }
+
+    @Benchmark
+    public ClusterState measureShardRelocationComplete() throws Exception {
+        ClusterState clusterState = initialClusterState;
+        clusterZoneAwareExcludeStrategy = Allocators.createAllocationService(
+            Settings.builder()
+                .put("cluster.routing.allocation.awareness.attributes", "zone")
+                .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(concurrentRecoveries))
+                .put("cluster.routing.allocation.cluster_concurrent_recoveries", String.valueOf(clusterConcurrentRecoveries))
+                .put("cluster.routing.allocation.exclude.tag", "tag_1")
+                .build()
+        );
+        clusterState = clusterZoneAwareExcludeStrategy.reroute(clusterState, "reroute");
+        while (clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() > 0) {
+            clusterState = clusterZoneAwareExcludeStrategy.applyStartedShards(
                clusterState,
                clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING)
            );
-            clusterState = strategy.reroute(clusterState, "reroute");
+        }
+        for (ShardRouting startedShard : clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED)) {
+            assert (clusterState.getRoutingNodes().node(startedShard.currentNodeId()).node().getAttributes().get("tag")).equals("tag_0");
        }
        return clusterState;
    }
+
+    private DiscoveryNodes.Builder setUpClusterNodes(int sourceNodes, int targetNodes) {
+        DiscoveryNodes.Builder nb = DiscoveryNodes.builder();
+        for (int i = 1; i <= sourceNodes; i++) {
+            Map<String, String> attributes = new HashMap<>();
+            attributes.put("tag", "tag_" + 1);
+            attributes.put("zone", "zone_" + (i % numZone));
+            nb.add(Allocators.newNode("node_s_" + i, attributes));
+        }
+        for (int j = 1; j <= targetNodes; j++) {
+            Map<String, String> attributes = new HashMap<>();
+            attributes.put("tag", "tag_" + 0);
+            attributes.put("zone", "zone_" + (j % numZone));
+            nb.add(Allocators.newNode("node_t_" + j, attributes));
+        }
+        return nb;
+    }
 }
--- a/server/src/main/java/org/opensearch/cluster/ClusterModule.java
+++ b/server/src/main/java/org/opensearch/cluster/ClusterModule.java
@ -57,6 +57,7 @@ import org.opensearch.cluster.routing.allocation.decider.AllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.AllocationDeciders;
 import org.opensearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
+import org.opensearch.cluster.routing.allocation.decider.ConcurrentRecoveriesAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.ConcurrentRebalanceAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.DiskThresholdDecider;
 import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider;
@ -244,6 +245,7 @@ public class ClusterModule extends AbstractModule {
        addAllocationDecider(deciders, new RebalanceOnlyWhenActiveAllocationDecider());
        addAllocationDecider(deciders, new ClusterRebalanceAllocationDecider(settings, clusterSettings));
        addAllocationDecider(deciders, new ConcurrentRebalanceAllocationDecider(settings, clusterSettings));
+        addAllocationDecider(deciders, new ConcurrentRecoveriesAllocationDecider(settings, clusterSettings));
        addAllocationDecider(deciders, new EnableAllocationDecider(settings, clusterSettings));
        addAllocationDecider(deciders, new NodeVersionAllocationDecider());
        addAllocationDecider(deciders, new SnapshotInProgressAllocationDecider());
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
@ -256,6 +256,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
        private final Metadata metadata;
        private final float avgShardsPerNode;
        private final NodeSorter sorter;
+        private final Set<RoutingNode> inEligibleTargetNode;

        public Balancer(Logger logger, RoutingAllocation allocation, WeightFunction weight, float threshold) {
            this.logger = logger;
@ -267,6 +268,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
            avgShardsPerNode = ((float) metadata.getTotalNumberOfShards()) / routingNodes.size();
            nodes = Collections.unmodifiableMap(buildModelFromAssigned());
            sorter = newNodeSorter();
+            inEligibleTargetNode = new HashSet<>();
        }

        /**
@ -632,6 +634,16 @@ public class BalancedShardsAllocator implements ShardsAllocator {
            return indices;
        }

+        /**
+         * Checks if target node is ineligible and if so, adds to the list
+         * of ineligible target nodes
+         */
+        private void checkAndAddInEligibleTargetNode(RoutingNode targetNode) {
+            Decision nodeLevelAllocationDecision = allocation.deciders().canAllocateAnyShardToNode(targetNode, allocation);
+            if (nodeLevelAllocationDecision.type() != Decision.Type.YES) {
+                inEligibleTargetNode.add(targetNode);
+            }
+        }
        /**
         * Move started shards that can not be allocated to a node anymore
         *
@ -646,8 +658,37 @@ public class BalancedShardsAllocator implements ShardsAllocator {
            // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling
            // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are
            // offloading the shards.
+
+            // Trying to eliminate target nodes so that we donot unnecessarily iterate over source nodes
+            // when no target is eligible
+            for (ModelNode currentNode : sorter.modelNodes) {
+                checkAndAddInEligibleTargetNode(currentNode.getRoutingNode());
+            }
            for (Iterator<ShardRouting> it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext(); ) {
+                //Verify if the cluster concurrent recoveries have been reached.
+                if (allocation.deciders().canMoveAnyShard(allocation).type() != Decision.Type.YES) {
+                    logger.info("Cannot move any shard in the cluster due to cluster concurrent recoveries getting breached"
+                                    + ". Skipping shard iteration");
+                    return;
+                }
+                //Early terminate node interleaved shard iteration when no eligible target nodes are available
+                if(sorter.modelNodes.length == inEligibleTargetNode.size()) {
+                    logger.info("Cannot move any shard in the cluster as there is no node on which shards can be allocated"
+                                    + ". Skipping shard iteration");
+                    return;
+                }
+
                ShardRouting shardRouting = it.next();
+
+                // Verify if the shard is allowed to move if outgoing recovery on the node hosting the primary shard
+                // is not being throttled.
+                Decision canMoveAwayDecision = allocation.deciders().canMoveAway(shardRouting, allocation);
+                if(canMoveAwayDecision.type() != Decision.Type.YES) {
+                    if (logger.isDebugEnabled())
+                        logger.debug("Cannot move away shard [{}] Skipping this shard", shardRouting);
+                    continue;
+                }
+
                final MoveDecision moveDecision = decideMove(shardRouting);
                if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) {
                    final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId());
@ -660,6 +701,11 @@ public class BalancedShardsAllocator implements ShardsAllocator {
                    if (logger.isTraceEnabled()) {
                        logger.trace("Moved shard [{}] to node [{}]", shardRouting, targetNode.getRoutingNode());
                    }
+
+                    // Verifying if this node can be considered ineligible for further iterations
+                    if (targetNode != null) {
+                        checkAndAddInEligibleTargetNode(targetNode.getRoutingNode());
+                    }
                } else if (moveDecision.isDecisionTaken() && moveDecision.canRemain() == false) {
                    logger.trace("[{}][{}] can't move", shardRouting.index(), shardRouting.id());
                }
@ -704,9 +750,22 @@ public class BalancedShardsAllocator implements ShardsAllocator {
            RoutingNode targetNode = null;
            final List<NodeAllocationResult> nodeExplanationMap = explain ? new ArrayList<>() : null;
            int weightRanking = 0;
+            int targetNodeProcessed = 0;
            for (ModelNode currentNode : sorter.modelNodes) {
                if (currentNode != sourceNode) {
                    RoutingNode target = currentNode.getRoutingNode();
+                    if(!explain && inEligibleTargetNode.contains(target))
+                        continue;
+                    // don't use canRebalance as we want hard filtering rules to apply. See #17698
+                    if (!explain) {
+                        // If we cannot allocate any shard to node marking it in eligible
+                        Decision nodeLevelAllocationDecision = allocation.deciders().canAllocateAnyShardToNode(target, allocation);
+                        if (nodeLevelAllocationDecision.type() != Decision.Type.YES) {
+                            inEligibleTargetNode.add(currentNode.getRoutingNode());
+                            continue;
+                        }
+                    }
+                    targetNodeProcessed++;
                    // don't use canRebalance as we want hard filtering rules to apply. See #17698
                    Decision allocationDecision = allocation.deciders().canAllocate(shardRouting, target, allocation);
                    if (explain) {
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AllocationDecider.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AllocationDecider.java
@ -128,4 +128,32 @@ public abstract class AllocationDecider {
            return decision;
        }
    }
+
+    /**
+     * Returns a {@link Decision} whether the given shard can be moved away from the current node
+     * {@link RoutingAllocation}. The default is {@link Decision#ALWAYS}.
+     */
+    public Decision canMoveAway(ShardRouting shardRouting, RoutingAllocation allocation) {
+        return Decision.ALWAYS;
+    }
+
+    /**
+     * Returns a {@link Decision} whether any shard in the cluster can be moved away from the current node
+     * {@link RoutingAllocation}. The default is {@link Decision#ALWAYS}.
+     */
+    public Decision canMoveAnyShard(RoutingAllocation allocation) {
+        return Decision.ALWAYS;
+    }
+
+    /**
+     * Returns a {@link Decision} whether any shard on the given
+     * {@link RoutingNode}} can be allocated The default is {@link Decision#ALWAYS}.
+     * All implementations that override this behaviour must take a
+     * {@link Decision}} whether or not to skip iterating over the remaining
+     * deciders for this node.
+     */
+    public Decision canAllocateAnyShardToNode(RoutingNode node, RoutingAllocation allocation) {
+        return Decision.ALWAYS;
+    }
+
 }
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AllocationDeciders.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AllocationDeciders.java
@ -64,7 +64,7 @@ public class AllocationDeciders extends AllocationDecider {
            Decision decision = allocationDecider.canRebalance(shardRouting, allocation);
            // short track if a NO is returned.
            if (decision == Decision.NO) {
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -91,7 +91,7 @@ public class AllocationDeciders extends AllocationDecider {
                        shardRouting, node.node(), allocationDecider.getClass().getSimpleName());
                }
                // short circuit only if debugging is not enabled
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -120,7 +120,7 @@ public class AllocationDeciders extends AllocationDecider {
                    logger.trace("Shard [{}] can not remain on node [{}] due to [{}]",
                        shardRouting, node.nodeId(), allocationDecider.getClass().getSimpleName());
                }
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -139,7 +139,7 @@ public class AllocationDeciders extends AllocationDecider {
            Decision decision = allocationDecider.canAllocate(indexMetadata, node, allocation);
            // short track if a NO is returned.
            if (decision == Decision.NO) {
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -158,7 +158,7 @@ public class AllocationDeciders extends AllocationDecider {
            Decision decision = allocationDecider.shouldAutoExpandToNode(indexMetadata, node, allocation);
            // short track if a NO is returned.
            if (decision == Decision.NO) {
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -177,7 +177,7 @@ public class AllocationDeciders extends AllocationDecider {
            Decision decision = allocationDecider.canAllocate(shardRouting, allocation);
            // short track if a NO is returned.
            if (decision == Decision.NO) {
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -196,7 +196,7 @@ public class AllocationDeciders extends AllocationDecider {
            Decision decision = allocationDecider.canRebalance(allocation);
            // short track if a NO is returned.
            if (decision == Decision.NO) {
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
@ -224,7 +224,70 @@ public class AllocationDeciders extends AllocationDecider {
                    logger.trace("Shard [{}] can not be forcefully allocated to node [{}] due to [{}].",
                        shardRouting.shardId(), node.nodeId(), decider.getClass().getSimpleName());
                }
-                if (!allocation.debugDecision()) {
+                if (allocation.debugDecision() == false) {
+                    return decision;
+                } else {
+                    ret.add(decision);
+                }
+            } else {
+                addDecision(ret, decision, allocation);
+            }
+        }
+        return ret;
+    }
+
+    @Override
+    public Decision canAllocateAnyShardToNode(RoutingNode node, RoutingAllocation allocation) {
+        Decision.Multi ret = new Decision.Multi();
+        for (AllocationDecider decider : allocations) {
+            Decision decision = decider.canAllocateAnyShardToNode(node, allocation);
+            if (decision.type().canPremptivelyReturn()) {
+                if (logger.isTraceEnabled()) {
+                    logger.trace("Shard can not be allocated on node [{}] due to [{}]", node.nodeId(), decider.getClass().getSimpleName());
+                }
+                if (allocation.debugDecision() == false) {
+                    return decision;
+                } else {
+                    ret.add(decision);
+                }
+            } else {
+                addDecision(ret, decision, allocation);
+            }
+        }
+        return ret;
+    }
+
+
+    @Override
+    public Decision canMoveAway(ShardRouting shardRouting, RoutingAllocation allocation) {
+        Decision.Multi ret = new Decision.Multi();
+        for (AllocationDecider decider : allocations) {
+            Decision decision = decider.canMoveAway(shardRouting, allocation);
+            // short track if a NO is returned.
+            if (decision.type().canPremptivelyReturn()) {
+                if (logger.isTraceEnabled()) {
+                    logger.trace("Shard [{}] can not be moved away due to [{}]", shardRouting, decider.getClass().getSimpleName());
+                }
+                if (allocation.debugDecision() == false) {
+                    return decision;
+                } else {
+                    ret.add(decision);
+                }
+            } else {
+                addDecision(ret, decision, allocation);
+            }
+        }
+        return ret;
+    }
+
+    @Override
+    public Decision canMoveAnyShard(RoutingAllocation allocation) {
+        Decision.Multi ret = new Decision.Multi();
+        for (AllocationDecider decider : allocations) {
+            Decision decision = decider.canMoveAnyShard(allocation);
+            // short track if a NO is returned.
+            if (decision.type().canPremptivelyReturn()) {
+                if (allocation.debugDecision() == false) {
                    return decision;
                } else {
                    ret.add(decision);
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/ConcurrentRecoveriesAllocationDecider.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/ConcurrentRecoveriesAllocationDecider.java
@ -0,0 +1,82 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Modifications Copyright OpenSearch Contributors. See
+ * GitHub history for details.
+ */
+
+package org.opensearch.cluster.routing.allocation.decider;
+
+import org.opensearch.cluster.routing.RoutingNode;
+import org.opensearch.cluster.routing.ShardRouting;
+import org.opensearch.cluster.routing.allocation.RoutingAllocation;
+import org.opensearch.common.settings.ClusterSettings;
+import org.opensearch.common.settings.Setting;
+import org.opensearch.common.settings.Setting.Property;
+import org.opensearch.common.settings.Settings;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+/**
+ * This {@link AllocationDecider} controls the number of currently in-progress
+ * re-balance (relocation) operations and restricts node allocations if the
+ * configured threshold is reached.
+ * <p>
+ * Re-balance operations can be controlled in real-time via the cluster update API using
+ * <code>cluster.routing.allocation.cluster_concurrent_recoveries</code>. Iff this
+ * setting is set to <code>-1</code> the number of cluster concurrent recoveries operations
+ * are unlimited.
+ */
+public class ConcurrentRecoveriesAllocationDecider extends AllocationDecider {
+
+    private static final Logger logger = LogManager.getLogger(ConcurrentRecoveriesAllocationDecider.class);
+
+    public static final String NAME = "cluster_concurrent_recoveries";
+
+
+    public static final Setting<Integer> CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_RECOVERIES_SETTING =
+        Setting.intSetting("cluster.routing.allocation.cluster_concurrent_recoveries", -1, -1,
+            Property.Dynamic, Property.NodeScope);
+
+    private volatile int clusterConcurrentRecoveries;
+
+    public ConcurrentRecoveriesAllocationDecider(Settings settings, ClusterSettings clusterSettings) {
+        this.clusterConcurrentRecoveries = CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_RECOVERIES_SETTING.get(settings);
+        logger.debug("using [cluster_concurrent_rebalance] with [{}]", clusterConcurrentRecoveries);
+        clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_RECOVERIES_SETTING,
+            this::setClusterConcurrentRebalance);
+    }
+
+    private void setClusterConcurrentRebalance(int clusterConcurrentRecoveries) {
+        this.clusterConcurrentRecoveries = clusterConcurrentRecoveries;
+    }
+
+    @Override
+    public Decision canMoveAnyShard(RoutingAllocation allocation) {
+        if (clusterConcurrentRecoveries == -1) {
+            return allocation.decision(Decision.YES, NAME, "undefined cluster concurrent recoveries");
+        }
+        int relocatingShards = allocation.routingNodes().getRelocatingShardCount();
+        if (relocatingShards >= clusterConcurrentRecoveries) {
+            return allocation.decision(Decision.THROTTLE, NAME,
+                "too many shards are concurrently relocating [%d], limit: [%d] cluster setting [%s=%d]",
+                relocatingShards, clusterConcurrentRecoveries, CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_RECOVERIES_SETTING.getKey(),
+                clusterConcurrentRecoveries);
+        }
+        return allocation.decision(Decision.YES, NAME,
+            "below threshold [%d] for concurrent recoveries, current relocating shard count [%d]",
+            clusterConcurrentRecoveries, relocatingShards);
+    }
+
+    @Override
+    public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
+        return canMoveAnyShard(allocation);
+    }
+
+}
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/Decision.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/Decision.java
@ -137,6 +137,10 @@ public abstract class Decision implements ToXContent, Writeable {
            return false;
        }

+        public boolean canPremptivelyReturn() {
+            return this == THROTTLE || this == NO;
+        }
+
    }

    /**
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/FilterAllocationDecider.java
@ -147,6 +147,12 @@ public class FilterAllocationDecider extends AllocationDecider {
        return allocation.decision(Decision.YES, NAME, "node passes include/exclude/require filters");
    }

+    @Override
+    public Decision canAllocateAnyShardToNode(RoutingNode node, RoutingAllocation allocation) {
+        Decision decision = shouldClusterFilter(node.node(), allocation);
+        return decision != null && decision == Decision.NO ? decision : Decision.ALWAYS;
+    }
+
    private Decision shouldFilter(ShardRouting shardRouting, DiscoveryNode node, RoutingAllocation allocation) {
        Decision decision = shouldClusterFilter(node, allocation);
        if (decision != null) return decision;
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/ThrottlingAllocationDecider.java
@ -224,4 +224,44 @@ public class ThrottlingAllocationDecider extends AllocationDecider {
        assert initializingShard.initializing();
        return initializingShard;
    }
+
+    @Override
+    public Decision canMoveAway(ShardRouting shardRouting, RoutingAllocation allocation) {
+        int outgoingRecoveries = 0;
+        if (!shardRouting.primary()) {
+            ShardRouting primaryShard = allocation.routingNodes().activePrimary(shardRouting.shardId());
+            outgoingRecoveries = allocation.routingNodes().getOutgoingRecoveries(primaryShard.currentNodeId());
+        } else {
+            outgoingRecoveries = allocation.routingNodes().getOutgoingRecoveries(shardRouting.currentNodeId());
+        }
+        if (outgoingRecoveries >= concurrentOutgoingRecoveries) {
+            return allocation.decision(
+                THROTTLE, NAME,
+                "too many outgoing shards are currently recovering [%d], limit: [%d] cluster setting [%s=%d]",
+                outgoingRecoveries, concurrentOutgoingRecoveries,
+                CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(),
+                concurrentOutgoingRecoveries
+            );
+        } else {
+            return allocation.decision(YES, NAME, "below shard recovery limit of outgoing: [%d < %d]", outgoingRecoveries,
+                concurrentOutgoingRecoveries);
+        }
+    }
+
+    @Override
+    public Decision canAllocateAnyShardToNode(RoutingNode node, RoutingAllocation allocation) {
+        int incomingRecoveries = allocation.routingNodes().getIncomingRecoveries(node.nodeId());
+        if (incomingRecoveries >= concurrentIncomingRecoveries) {
+            return allocation.decision(
+                THROTTLE, NAME,
+                "too many incoming shards are currently recovering [%d], limit: [%d] cluster setting [%s=%d]",
+                incomingRecoveries, concurrentIncomingRecoveries,
+                CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
+                concurrentIncomingRecoveries
+            );
+        } else {
+            return allocation.decision(YES, NAME, "below shard recovery limit of incoming: [%d < %d]", incomingRecoveries,
+                concurrentIncomingRecoveries);
+        }
+    }
 }
--- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java
+++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java
@ -67,6 +67,7 @@ import org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocat
 import org.opensearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.ConcurrentRebalanceAllocationDecider;
+import org.opensearch.cluster.routing.allocation.decider.ConcurrentRecoveriesAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.DiskThresholdDecider;
 import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider;
@ -226,6 +227,7 @@ public final class ClusterSettings extends AbstractScopedSettings {
            BreakerSettings.CIRCUIT_BREAKER_TYPE,
            ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING,
            ConcurrentRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_REBALANCE_SETTING,
+            ConcurrentRecoveriesAllocationDecider.CLUSTER_ROUTING_ALLOCATION_CLUSTER_CONCURRENT_RECOVERIES_SETTING,
            DanglingIndicesState.AUTO_IMPORT_DANGLING_INDICES_SETTING,
            EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING,
            EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING,
--- a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java
+++ b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java
@ -43,6 +43,7 @@ import org.opensearch.cluster.routing.allocation.decider.AllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.ConcurrentRebalanceAllocationDecider;
+import org.opensearch.cluster.routing.allocation.decider.ConcurrentRecoveriesAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.DiskThresholdDecider;
 import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider;
@ -219,6 +220,7 @@ public class ClusterModuleTests extends ModuleTestCase {
            RebalanceOnlyWhenActiveAllocationDecider.class,
            ClusterRebalanceAllocationDecider.class,
            ConcurrentRebalanceAllocationDecider.class,
+            ConcurrentRecoveriesAllocationDecider.class,
            EnableAllocationDecider.class,
            NodeVersionAllocationDecider.class,
            SnapshotInProgressAllocationDecider.class,
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/AllocationDecidersTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/AllocationDecidersTests.java
@ -103,6 +103,18 @@ public class AllocationDecidersTests extends OpenSearchTestCase {
            public Decision canRebalance(RoutingAllocation allocation) {
                return Decision.YES;
            }
+
+            public Decision canMoveAway(ShardRouting shardRouting, RoutingAllocation allocation) {
+                return Decision.YES;
+            }
+
+            public Decision canMoveAnyShard(RoutingAllocation allocation) {
+                return Decision.YES;
+            }
+
+            public Decision canAllocateAnyShardToNode(RoutingNode node, RoutingAllocation allocation) {
+                return Decision.YES;
+            }
        }));

        ClusterState clusterState = ClusterState.builder(new ClusterName("test")).build();
@ -125,6 +137,9 @@ public class AllocationDecidersTests extends OpenSearchTestCase {
        verify(deciders.canRemain(shardRouting, routingNode, allocation), matcher);
        verify(deciders.canForceAllocatePrimary(shardRouting, routingNode, allocation), matcher);
        verify(deciders.shouldAutoExpandToNode(idx, null, allocation), matcher);
+        verify(deciders.canMoveAway(shardRouting, allocation), matcher);
+        verify(deciders.canMoveAnyShard(allocation), matcher);
+        verify(deciders.canAllocateAnyShardToNode(routingNode, allocation), matcher);
    }

    private void verify(Decision decision, Matcher<Collection<? extends Decision>> matcher) {
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/ConcurrentRecoveriesAllocationDeciderTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/decider/ConcurrentRecoveriesAllocationDeciderTests.java
@ -0,0 +1,207 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Modifications Copyright OpenSearch Contributors. See
+ * GitHub history for details.
+ */
+
+package org.opensearch.cluster.routing.allocation.decider;
+
+import org.opensearch.Version;
+import org.opensearch.cluster.ClusterName;
+import org.opensearch.cluster.ClusterState;
+import org.opensearch.cluster.OpenSearchAllocationTestCase;
+import org.opensearch.cluster.metadata.IndexMetadata;
+import org.opensearch.cluster.metadata.Metadata;
+import org.opensearch.cluster.node.DiscoveryNodes;
+import org.opensearch.cluster.routing.RoutingTable;
+import org.opensearch.cluster.routing.ShardRouting;
+import org.opensearch.cluster.routing.ShardRoutingState;
+import org.opensearch.cluster.routing.allocation.AllocationService;
+import org.opensearch.common.collect.Tuple;
+import org.opensearch.common.settings.Settings;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+
+public class ConcurrentRecoveriesAllocationDeciderTests extends OpenSearchAllocationTestCase {
+
+    public void testClusterConcurrentRecoveries() {
+        int primaryShards = 5, replicaShards = 1, numberIndices = 12;
+        int clusterConcurrentRecoveries = -1;
+        int nodeConcurrentRecoveries = 4;
+        AllocationService initialStrategy = createAllocationService(
+            Settings.builder().put("cluster.routing.allocation.awareness.attributes", "zone")
+                    .put("cluster.routing.allocation.node_initial_primaries_recoveries", "8")
+                    .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(nodeConcurrentRecoveries))
+                    .put("cluster.routing.allocation.exclude.tag", "tag_0").build());
+
+        AllocationService excludeStrategy = null;
+
+        logger.info("Building initial routing table");
+
+        Metadata.Builder metadataBuilder = Metadata.builder();
+        for (int i = 0; i < numberIndices; i++) {
+            metadataBuilder.put(IndexMetadata.builder("test_" + i).settings(settings(Version.CURRENT)).numberOfShards(primaryShards)
+                                    .numberOfReplicas(replicaShards));
+        }
+        RoutingTable.Builder initialRoutingTableBuilder = RoutingTable.builder();
+        Metadata metadata = metadataBuilder.build();
+        for (int i = 0; i < numberIndices; i++) {
+            initialRoutingTableBuilder.addAsNew(metadata.index("test_" + i));
+        }
+        RoutingTable routingTable = initialRoutingTableBuilder.build();
+
+        logger.info("--> adding nodes and starting shards");
+
+        List<Tuple<Integer, Integer>> srcTargetNodes = Collections.unmodifiableList(Arrays.<Tuple<Integer, Integer>>asList(
+            new Tuple(10, 4),
+            new Tuple(4, 10),
+            new Tuple(10, 10))
+        );
+
+        for (Tuple<Integer, Integer> srcTargetNode : srcTargetNodes) {
+
+            int srcNodes = srcTargetNode.v1();
+            int targetNodes = srcTargetNode.v2();
+
+            logger.info("Setting up tests for src node {} and target node {}", srcNodes, targetNodes);
+
+            ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)).metadata(metadata)
+                                            .routingTable(routingTable).nodes(setUpClusterNodes(srcNodes, targetNodes)).build();
+
+            clusterState = initialStrategy.reroute(clusterState, "reroute");
+
+            // Initialize shards
+
+            logger.info("--> Starting primary shards");
+            while (clusterState.getRoutingNodes().hasUnassignedShards()) {
+                clusterState = startInitializingShardsAndReroute(initialStrategy, clusterState);
+            }
+
+            logger.info("--> Starting replica shards");
+            while (clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() > 0) {
+                clusterState = startInitializingShardsAndReroute(initialStrategy, clusterState);
+            }
+
+            assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(),
+                equalTo((replicaShards + 1) * primaryShards * numberIndices));
+            assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(0));
+            assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size(), equalTo(0));
+
+            clusterConcurrentRecoveries = Math.min(srcNodes, targetNodes) * nodeConcurrentRecoveries;
+            excludeStrategy = createAllocationService(Settings.builder().put("cluster.routing.allocation.awareness.attributes", "zone")
+                    .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(nodeConcurrentRecoveries))
+                    .put("cluster.routing.allocation.cluster_concurrent_recoveries", String.valueOf(clusterConcurrentRecoveries))
+                    .put("cluster.routing.allocation.exclude.tag", "tag_1").build());
+
+            for (int counter = 0; counter < 3; counter++) {
+                logger.info("--> Performing a reroute ");
+                clusterState = excludeStrategy.reroute(clusterState, "reroute");
+                assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size(),
+                    equalTo(clusterConcurrentRecoveries));
+                for (ShardRouting startedShard : clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED)) {
+                    assertThat(clusterState.getRoutingNodes().node(startedShard.currentNodeId()).node().getAttributes().get("tag"),
+                        equalTo("tag_1"));
+                }
+            }
+
+            // Ensure all shards are started
+            while (clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() > 0) {
+                clusterState = startInitializingShardsAndReroute(initialStrategy, clusterState);
+            }
+
+            clusterConcurrentRecoveries = clusterConcurrentRecoveries - randomInt(5);
+            excludeStrategy = createAllocationService(Settings.builder().put("cluster.routing.allocation.awareness.attributes", "zone")
+                    .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(nodeConcurrentRecoveries))
+                    .put("cluster.routing.allocation.cluster_concurrent_recoveries", String.valueOf(clusterConcurrentRecoveries))
+                    .put("cluster.routing.allocation.exclude.tag", "tag_1").build());
+
+            for (int counter = 0; counter < 3; counter++) {
+                logger.info("--> Performing a reroute ");
+                clusterState = excludeStrategy.reroute(clusterState, "reroute");
+                assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size(),
+                    equalTo(clusterConcurrentRecoveries));
+                for (ShardRouting startedShard : clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED)) {
+                    assertThat(clusterState.getRoutingNodes().node(startedShard.currentNodeId()).node().getAttributes().get("tag"),
+                        equalTo("tag_1"));
+                }
+            }
+
+            // Ensure all shards are started
+            while (clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() > 0) {
+                clusterState = startInitializingShardsAndReroute(initialStrategy, clusterState);
+            }
+
+            logger.info("--> Disabling cluster_concurrent_recoveries and re-routing ");
+            clusterConcurrentRecoveries = Math.min(srcNodes, targetNodes) * nodeConcurrentRecoveries;
+
+            for (int counter = 0; counter < 3; counter++) {
+                logger.info("--> Performing a reroute ");
+                excludeStrategy = createAllocationService(
+                    Settings.builder().put("cluster.routing.allocation.awareness.attributes", "zone")
+                            .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(nodeConcurrentRecoveries))
+                            .put("cluster.routing.allocation.exclude.tag", "tag_1").build());
+
+                clusterState = excludeStrategy.reroute(clusterState, "reroute");
+                //When srcNodes < targetNodes relocations go beyond the Math.min(srcNodes, targetNodes) * nodeConcurrentRecoveries limit as
+                // outgoing recoveries happens target nodes which anyways doesn't get throttled on incoming recoveries
+                if (srcNodes >= targetNodes) {
+                    assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size(),
+                        equalTo(clusterConcurrentRecoveries));
+                }else {
+                    assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size(),
+                        greaterThanOrEqualTo(clusterConcurrentRecoveries));
+                }
+
+            }
+            // Ensure all shards are started
+            while (clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size() > 0) {
+                clusterState = startInitializingShardsAndReroute(initialStrategy, clusterState);
+            }
+
+            logger.info("--> Bumping cluster_concurrent_recoveries up and re-routing ");
+            clusterConcurrentRecoveries = clusterConcurrentRecoveries + randomInt(5);
+            int expectedClusterConcurrentRecoveries = Math.min(srcNodes, targetNodes) * nodeConcurrentRecoveries;
+            for (int counter = 0; counter < 3; counter++) {
+                logger.info("--> Performing a reroute ");
+                excludeStrategy = createAllocationService(Settings.builder().put("cluster.routing.allocation.awareness.attributes", "zone")
+                        .put("cluster.routing.allocation.node_concurrent_recoveries", String.valueOf(nodeConcurrentRecoveries))
+                        .put("cluster.routing.allocation.exclude.tag", "tag_1").build());
+                clusterState = excludeStrategy.reroute(clusterState, "reroute");
+                assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.RELOCATING).size(),
+                        equalTo(expectedClusterConcurrentRecoveries));
+
+            }
+        }
+    }
+
+    private DiscoveryNodes.Builder setUpClusterNodes(int sourceNodes, int targetNodes) {
+        DiscoveryNodes.Builder nb = DiscoveryNodes.builder();
+        for (int i = 1; i <= sourceNodes; i++) {
+            Map<String, String> attributes = new HashMap<>();
+            attributes.put("tag", "tag_" + 1);
+            attributes.put("zone", "zone_" + (i % 2));
+            nb.add(newNode("node_s_" + i, attributes));
+        }
+        for (int j = 1; j <= targetNodes; j++) {
+            Map<String, String> attributes = new HashMap<>();
+            attributes.put("tag", "tag_" + 0);
+            attributes.put("zone", "zone_" + (j % 2));
+            nb.add(newNode("node_t_" + j, attributes));
+        }
+        return nb;
+    }
+}