[1.x] An allocation constraint mechanism, that de-prioritizes nodes from getting picked for allocation if they breach certain constraints (#777)

* An allocation constraint mechanism, that de-prioritizes nodes from getting picked for allocation if they breach certain constraints Signed-off-by: Ashwin Pankaj <appankaj@amazon.com> * Precommit fixes Signed-off-by: Ashwin Pankaj <appankaj@amazon.com> * Fix license header in test file Signed-off-by: Ashwin Pankaj <appankaj@amazon.com> * Review comments Signed-off-by: Ashwin Pankaj <appankaj@amazon.com> Co-authored-by: Pankaj <appankaj@88665a1e205d.ant.amazon.com>
2021-06-04 10:47:47 +05:30 · 2021-06-04 10:47:47 +05:30 · b4c697a29d
parent 4f5b48f722
commit b4c697a29d
7 changed files with 862 additions and 13 deletions
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/AllocationConstraints.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/AllocationConstraints.java
@ -0,0 +1,86 @@
 /*
 * Copyright OpenSearch Contributors.
 * SPDX-License-Identifier: Apache-2.0
 */
 package org.opensearch.cluster.routing.allocation;
 import org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.function.Predicate;
 /**
 * Allocation constraints specify conditions which, if breached, reduce the
 * priority of a node for receiving shard allocations.
 */
 public class AllocationConstraints {
    public final long CONSTRAINT_WEIGHT = 1000000L;
    private List<Predicate<ConstraintParams>> constraintPredicates;
    public AllocationConstraints() {
        this.constraintPredicates = new ArrayList<>(1);
        this.constraintPredicates.add(isIndexShardsPerNodeBreached());
    }
    class ConstraintParams {
        private BalancedShardsAllocator.Balancer balancer;
        private BalancedShardsAllocator.ModelNode node;
        private String index;
        ConstraintParams(BalancedShardsAllocator.Balancer balancer, BalancedShardsAllocator.ModelNode node,
                String index) {
            this.balancer = balancer;
            this.node = node;
            this.index = index;
        }
    }
    /**
     * Evaluates configured allocation constraint predicates for given node - index
     * combination; and returns a weight value based on the number of breached
     * constraints.
     *
     * Constraint weight should be added to the weight calculated via weight
     * function, to reduce priority of allocating on nodes with breached
     * constraints.
     *
     * This weight function is used only in case of unassigned shards to avoid overloading a newly added node.
     * Weight calculation in other scenarios like shard movement and re-balancing remain unaffected by this function.
     */
    public long weight(BalancedShardsAllocator.Balancer balancer, BalancedShardsAllocator.ModelNode node,
            String index) {
        int constraintsBreached = 0;
        ConstraintParams params = new ConstraintParams(balancer, node, index);
        for (Predicate<ConstraintParams> predicate : constraintPredicates) {
            if (predicate.test(params)) {
                constraintsBreached++;
            }
        }
        return constraintsBreached * CONSTRAINT_WEIGHT;
    }
    /**
     * Constraint to control number of shards of an index allocated on a single
     * node.
     *
     * In current weight function implementation, when a node has significantly
     * fewer shards than other nodes (e.g. during single new node addition or node
     * replacement), its weight is much less than other nodes. All shard allocations
     * at this time tend to land on the new node with skewed weight. This breaks
     * index level balance in the cluster, by creating all shards of the same index
     * on one node, often resulting in a hotspot on that node.
     *
     * This constraint is breached when balancer attempts to allocate more than
     * average shards per index per node.
     */
    private Predicate<ConstraintParams> isIndexShardsPerNodeBreached() {
        return (params) -> {
            int currIndexShardsOnNode = params.node.numShards(params.index);
            int allowedIndexShardsPerNode = (int) Math.ceil(params.balancer.avgShardsPerNode(params.index));
            return (currIndexShardsOnNode >= allowedIndexShardsPerNode);
        };
    }
 }
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
@ -45,6 +45,7 @@ import org.opensearch.cluster.routing.ShardRoutingState;
 import org.opensearch.cluster.routing.UnassignedInfo;
 import org.opensearch.cluster.routing.UnassignedInfo.AllocationStatus;
 import org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision;
 import org.opensearch.cluster.routing.allocation.AllocationConstraints;
 import org.opensearch.cluster.routing.allocation.AllocationDecision;
 import org.opensearch.cluster.routing.allocation.MoveDecision;
 import org.opensearch.cluster.routing.allocation.NodeAllocationResult;
@ -192,7 +193,6 @@ public class BalancedShardsAllocator implements ShardsAllocator {
        return weightFunction.shardBalance;
    }
    /**
     * This class is the primary weight function used to create balanced over nodes and shards in the cluster.
     * Currently this function has 3 properties:
@ -216,13 +216,16 @@ public class BalancedShardsAllocator implements ShardsAllocator {
     * </li>
     * </ul>
     * <code>weight(node, index) = weight<sub>index</sub>(node, index) + weight<sub>node</sub>(node, index)</code>
     *
     * package-private for testing
     */
-    private static class WeightFunction {
+    static class WeightFunction {
        private final float indexBalance;
        private final float shardBalance;
        private final float theta0;
        private final float theta1;
        private AllocationConstraints constraints;
        WeightFunction(float indexBalance, float shardBalance) {
            float sum = indexBalance + shardBalance;
@ -233,6 +236,12 @@ public class BalancedShardsAllocator implements ShardsAllocator {
            theta1 = indexBalance / sum;
            this.indexBalance = indexBalance;
            this.shardBalance = shardBalance;
            this.constraints = new AllocationConstraints();
        }
        public float weightWithAllocationConstraints(Balancer balancer, ModelNode node, String index) {
            float balancerWeight = weight(balancer, node, index);
            return balancerWeight + constraints.weight(balancer, node, index);
        }
        float weight(Balancer balancer, ModelNode node, String index) {
@ -411,7 +420,10 @@ public class BalancedShardsAllocator implements ShardsAllocator {
                    boolean deltaAboveThreshold = lessThan(currentDelta, threshold) == false;
                    // calculate the delta of the weights of the two nodes if we were to add the shard to the
                    // node in question and move it away from the node that currently holds it.
-                    boolean betterWeightWithShardAdded = nodeWeight + 1.0f < currentWeight;
+                    // hence we add 2.0f to the weight delta
                    float proposedDelta = 2.0f + nodeWeight - currentWeight;
                    boolean betterWeightWithShardAdded = proposedDelta < currentDelta;
                    rebalanceConditionsMet = deltaAboveThreshold && betterWeightWithShardAdded;
                    // if the simulated weight delta with the shard moved away is better than the weight delta
                    // with the shard remaining on the current node, and we are allowed to allocate to the
@ -964,7 +976,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
                }
                // weight of this index currently on the node
-                float currentWeight = weight.weight(this, node, shard.getIndexName());
+                float currentWeight = weight.weightWithAllocationConstraints(this, node, shard.getIndexName());
                // moving the shard would not improve the balance, and we are not in explain mode, so short circuit
                if (currentWeight > minWeight && explain == false) {
                    continue;
@ -1002,7 +1014,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
                            updateMinNode = currentDecision.type() == Type.YES;
                        }
                    } else {
-                        updateMinNode = true;
+                        updateMinNode = currentWeight < minWeight;
                    }
                    if (updateMinNode) {
                        minNode = node;
@ -1086,7 +1098,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
    }
-    static class ModelNode implements Iterable<ModelIndex> {
+    public static class ModelNode implements Iterable<ModelIndex> {
        private final Map<String, ModelIndex> indices = new HashMap<>();
        private int numShards = 0;
        private final RoutingNode routingNode;
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/AllocationConstraintsTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/AllocationConstraintsTests.java
@ -0,0 +1,65 @@
 /*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */
 package org.opensearch.cluster.routing.allocation;
 import org.opensearch.cluster.OpenSearchAllocationTestCase;
 import org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 import org.opensearch.common.settings.ClusterSettings;
 import org.opensearch.common.settings.Settings;
 import static org.mockito.Matchers.anyString;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 public class AllocationConstraintsTests extends OpenSearchAllocationTestCase {
    public void testSettings() {
        Settings.Builder settings = Settings.builder();
        ClusterSettings service = new ClusterSettings(Settings.builder().build(),
            ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
        BalancedShardsAllocator allocator = new BalancedShardsAllocator(settings.build(), service);
        settings = Settings.builder();
        float indexBalanceFactor = randomFloat();
        float shardBalance = randomFloat();
        float threshold = randomFloat();
        settings.put(BalancedShardsAllocator.INDEX_BALANCE_FACTOR_SETTING.getKey(), indexBalanceFactor);
        settings.put(BalancedShardsAllocator.SHARD_BALANCE_FACTOR_SETTING.getKey(), shardBalance);
        settings.put(BalancedShardsAllocator.THRESHOLD_SETTING.getKey(), threshold);
        service.applySettings(settings.build());
        assertEquals(indexBalanceFactor, allocator.getIndexBalance(), 0.01);
        assertEquals(shardBalance, allocator.getShardBalance(), 0.01);
        assertEquals(threshold, allocator.getThreshold(), 0.01);
    }
    /**
     * Test constraint evaluation logic when with different values of ConstraintMode
     * for IndexShardPerNode constraint satisfied and breached.
     */
    public void testIndexShardsPerNodeConstraint() {
        BalancedShardsAllocator.Balancer balancer = mock(BalancedShardsAllocator.Balancer.class);
        BalancedShardsAllocator.ModelNode node = mock(BalancedShardsAllocator.ModelNode.class);
        AllocationConstraints constraints = new AllocationConstraints();
        int shardCount = randomIntBetween(1, 500);
        float avgShardsPerNode = 1.0f + (random().nextFloat()) * 999.0f;
        when(balancer.avgShardsPerNode(anyString())).thenReturn(avgShardsPerNode);
        when(node.numShards(anyString())).thenReturn(shardCount);
        when(node.getNodeId()).thenReturn("test-node");
        long expectedWeight = (shardCount >= avgShardsPerNode) ? constraints.CONSTRAINT_WEIGHT : 0;
        assertEquals(expectedWeight, constraints.weight(balancer, node, "index"));
    }
 }
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardConstraintDeciderOverlapTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardConstraintDeciderOverlapTests.java
@ -0,0 +1,202 @@
 /*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */
 package org.opensearch.cluster.routing.allocation;
 import org.opensearch.cluster.ClusterInfo;
 import org.opensearch.cluster.ClusterInfoService;
 import org.opensearch.cluster.ClusterState;
 import org.opensearch.cluster.DiskUsage;
 import org.opensearch.cluster.OpenSearchAllocationWithConstraintsTestCase;
 import org.opensearch.cluster.node.DiscoveryNodes;
 import org.opensearch.cluster.routing.ShardRouting;
 import org.opensearch.common.collect.ImmutableOpenMap;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.index.shard.ShardId;
 import org.opensearch.test.VersionUtils;
 import java.util.HashMap;
 import java.util.Map;
 import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
 import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED;
 public class IndexShardConstraintDeciderOverlapTests extends OpenSearchAllocationWithConstraintsTestCase {
    /**
     * High watermark breach blocks new shard allocations to affected nodes. If shard count on such
     * nodes is low, this will cause IndexShardPerNodeConstraint to breach.
     *
     * This test verifies that this doesn't lead to unassigned shards, and there are no hot spots in eligible
     * nodes.
     */
    public void testHighWatermarkBreachWithLowShardCount() {
        setupInitialCluster(3, 15, 10, 1);
        addNodesWithIndexing(1, "high_watermark_node_", 6, 5, 1);
        // Disk threshold settings enabled
        Settings settings = Settings.builder()
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true)
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7)
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8)
            .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), 0.95)
            .put("cluster.routing.allocation.node_concurrent_recoveries", 1)
            .put("cluster.routing.allocation.cluster_concurrent_recoveries", 1)
            .build();
        // Build Shard size and disk usages
        ImmutableOpenMap.Builder<String, DiskUsage> usagesBuilder = ImmutableOpenMap.builder();
        usagesBuilder.put("node_0", new DiskUsage("node_0", "node_0", "/dev/null", 100, 80)); // 20% used
        usagesBuilder.put("node_1", new DiskUsage("node_1", "node_1", "/dev/null", 100, 55)); // 45% used
        usagesBuilder.put("node_2", new DiskUsage("node_2", "node_2", "/dev/null", 100, 35)); // 65% used
        usagesBuilder.put("high_watermark_node_0",
            new DiskUsage("high_watermark_node_0", "high_watermark_node_0", "/dev/null", 100, 10)); // 90% used
        ImmutableOpenMap<String, DiskUsage> usages = usagesBuilder.build();
        ImmutableOpenMap.Builder<String, Long> shardSizesBuilder = ImmutableOpenMap.builder();
        clusterState.getRoutingTable().allShards().forEach(shard ->
            shardSizesBuilder.put(shardIdentifierFromRouting(shard), 1L)); // Each shard is 1 byte
        ImmutableOpenMap<String, Long> shardSizes = shardSizesBuilder.build();
        final ImmutableOpenMap<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace =
            new ImmutableOpenMap.Builder<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace>()
                .fPut(
                    getNodeAndDevNullPath("node_0"), getReservedSpace()
                )
                .fPut(
                    getNodeAndDevNullPath("node_1"), getReservedSpace()
                )
                .fPut(
                    getNodeAndDevNullPath("node_2"), getReservedSpace()
                )
                .fPut(
                    getNodeAndDevNullPath("high_watermark_node_0"), getReservedSpace()
                )
                .build();
        final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes, reservedSpace);
        ClusterInfoService cis = () -> clusterInfo;
        allocation = createAllocationService(settings, cis);
        allocateAndCheckIndexShardHotSpots(false, 3, "node_0", "node_1", "node_2");
        assertForIndexShardHotSpots(true, 4);
        assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
        assertTrue(clusterState.getRoutingNodes().node("high_watermark_node_0").isEmpty());
        /* Shard sizes that would breach high watermark on node_2 if allocated.
         */
        addIndices("big_index_", 1, 10, 0);
        ImmutableOpenMap.Builder<String, Long>  bigIndexShardSizeBuilder = ImmutableOpenMap.builder(shardSizes);
        clusterState.getRoutingNodes().unassigned()
            .forEach(shard -> bigIndexShardSizeBuilder.put(shardIdentifierFromRouting(shard), 20L));
        shardSizes = bigIndexShardSizeBuilder.build();
        final ClusterInfo bigIndexClusterInfo = new DevNullClusterInfo(usages, usages, shardSizes, reservedSpace);
        cis = () -> bigIndexClusterInfo;
        allocation = createAllocationService(settings, cis);
        allocateAndCheckIndexShardHotSpots(false, 2, "node_0", "node_1");
        assertForIndexShardHotSpots(true, 4);
        assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
        for (ShardRouting shard: clusterState.getRoutingTable().index("big_index_0").shardsWithState(STARTED)) {
            assertNotEquals("node_2", shard.currentNodeId());
        }
    }
    private ClusterInfo.NodeAndPath getNodeAndDevNullPath(String node) {
        return new ClusterInfo.NodeAndPath(node, "/dev/null");
    }
    private ClusterInfo.ReservedSpace getReservedSpace() {
        return new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0),  2).build();
    }
    /**
     * Test clusters with subset of nodes on older version.
     * New version shards should not migrate to old version nodes, even if this creates potential hot spots.
     */
    public void testNodeVersionCompatibilityOverlap() {
        setupInitialCluster(3, 6, 10, 1);
        // Add an old version node and exclude a new version node
        DiscoveryNodes.Builder nb = DiscoveryNodes.builder(clusterState.nodes())
            .add(newNode("old_node", VersionUtils.getPreviousVersion()));
        clusterState = ClusterState.builder(clusterState).nodes(nb.build()).build();
        buildAllocationService("node_0");
        // Shards should only go to remaining new version nodes
        allocateAndCheckIndexShardHotSpots(false, 2, "node_1", "node_2");
        assertForIndexShardHotSpots(true, 4);
        assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
        for (ShardRouting shard: clusterState.getRoutingTable().allShards()) {
            assertNotEquals("node_0", shard.currentNodeId());
            assertNotEquals("old_node", shard.currentNodeId());
        }
    }
    /**
     * Test zone aware clusters with balanced zones.
     * No hot spots expected.
     */
    public void testZoneBalanced() {
        Map<String, Integer> nodesPerZone = new HashMap<>();
        nodesPerZone.put("zone_0", 3);
        nodesPerZone.put("zone_1", 3);
        createEmptyZoneAwareCluster(nodesPerZone);
        addIndices("index_", 4, 5, 1);
        buildZoneAwareAllocationService();
        allocateAndCheckIndexShardHotSpots(false, 6);
        resetCluster();
        buildZoneAwareAllocationService();
        allocateAndCheckIndexShardHotSpots(false, 6);
    }
    /**
     * Test zone aware clusters with unbalanced zones.
     * Hot spots expected as awareness forces shards per zone restrictions.
     */
    public void testZoneUnbalanced() {
        Map<String, Integer> nodesPerZone = new HashMap<>();
        nodesPerZone.put("zone_0", 5);
        nodesPerZone.put("zone_1", 1);
        createEmptyZoneAwareCluster(nodesPerZone);
        addIndices("index_", 1, 5, 1);
        updateInitialCluster();
        buildZoneAwareAllocationService();
        clusterState = allocateShardsAndBalance(clusterState);
        assertForIndexShardHotSpots(true, 6);
        assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
        resetCluster();
        buildZoneAwareAllocationService();
        clusterState = allocateShardsAndBalance(clusterState);
        assertForIndexShardHotSpots(true, 6);
        assertTrue(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).isEmpty());
    }
    /**
     * ClusterInfo that always points to DevNull.
     */
    public static class DevNullClusterInfo extends ClusterInfo {
        public DevNullClusterInfo(ImmutableOpenMap<String, DiskUsage> leastAvailableSpaceUsage,
                                  ImmutableOpenMap<String, DiskUsage> mostAvailableSpaceUsage,
                                  ImmutableOpenMap<String, Long> shardSizes,
                                  ImmutableOpenMap<NodeAndPath, ReservedSpace> reservedSpace) {
            super(leastAvailableSpaceUsage, mostAvailableSpaceUsage, shardSizes, null, reservedSpace);
        }
        @Override
        public String getDataPath(ShardRouting shardRouting) {
            return "/dev/null";
        }
    }
 }
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardHotSpotTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/IndexShardHotSpotTests.java
@ -0,0 +1,123 @@
 /*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */
 package org.opensearch.cluster.routing.allocation;
 import org.opensearch.cluster.OpenSearchAllocationWithConstraintsTestCase;
 public class IndexShardHotSpotTests extends OpenSearchAllocationWithConstraintsTestCase {
    /**
     * Test single node replacement without active indexing.
     */
    public void testNodeReplacement() {
        setupInitialCluster(5, 1, 5, 1);
        terminateNodes("node_1");
        assertForIndexShardHotSpots(false, 4);
        addNodesWithoutIndexing(1, "new_node_");
        int movesForModeNone = allocateAndCheckIndexShardHotSpots(false, 5, "new_node_0");
        setupInitialCluster(5, 1, 5, 1);
        terminateNodes("node_1");
        assertForIndexShardHotSpots(false, 4);
        addNodesWithoutIndexing(1, "new_node_");
        int movesForModeUnassigned = allocateAndCheckIndexShardHotSpots(false, 5, "new_node_0");
        assertTrue(movesForModeUnassigned <= movesForModeNone);
    }
    /**
     * Test single node replacement with active indexing.
     */
    public void testNodeReplacementWithIndexing() {
        setupInitialCluster(5, 30, 5, 1);
        buildAllocationService();
        terminateNodes("node_1");
        assertForIndexShardHotSpots(false, 4);
        addNodesWithIndexing(1, "new_node_", 3, 20, 1);
        int movesForModeNone = allocateAndCheckIndexShardHotSpots(false, 5, "new_node_0");
        resetCluster();
        buildAllocationService();
        terminateNodes("node_1");
        assertForIndexShardHotSpots(false, 4);
        addNodesWithIndexing(1, "new_node_", 3, 20, 1);
        int movesForModeUnassigned = allocateAndCheckIndexShardHotSpots(false, 5, "new_node_0");
        assertTrue(movesForModeUnassigned <= movesForModeNone);
    }
    /**
     * Test skewed cluster scale out via single node addition during active indexing.
     */
    public void testSkewedClusterScaleOut() {
        setupInitialCluster(3, 30, 10, 1);
        buildAllocationService();
        addNodesWithIndexing(1, "new_node_", 8, 10, 1);
        int movesForModeNone = allocateAndCheckIndexShardHotSpots( false, 4, "new_node_0");
        resetCluster();
        buildAllocationService();
        addNodesWithIndexing(1, "new_node_", 8, 10, 1);
        int movesForModeUnassigned = allocateAndCheckIndexShardHotSpots(false, 4, "new_node_0");
        assertTrue(movesForModeUnassigned <= movesForModeNone);
    }
    /**
     * Test under replicated yellow cluster scale out to green.
     *
     * This scenario is not expected to create hotspots even without constraints enabled. The
     * test is a sanity check to ensure allocation constraints don't worsen the situation.
     */
    public void testUnderReplicatedClusterScaleOut() {
        setupInitialCluster(3, 30, 10, 3);
        buildAllocationService();
        addNodesWithoutIndexing(1, "new_node_");
        int movesForModeNone = allocateAndCheckIndexShardHotSpots(false, 4, "new_node_0");
        resetCluster();
        buildAllocationService();
        addNodesWithoutIndexing(1, "new_node_");
        int movesForModeUnassigned = allocateAndCheckIndexShardHotSpots(false, 4, "new_node_0");
        assertTrue(movesForModeUnassigned <= movesForModeNone);
    }
    /**
     * Test cluster scale in scenario, when nodes are gracefully excluded from
     * cluster before termination.
     *
     * During moveShards(), shards are picked from across indexes in an interleaved manner.
     * This prevents hot spots by evenly picking up shards. Since shard order can change
     * in subsequent runs, we are not guaranteed to less moves than no allocation constraint run.
     *
     * Move tests are hence just a sanity test, to ensure we don't create any unexpected hot spots with
     * allocation settings.
     */
    public void testClusterScaleIn() {
        setupInitialCluster(4, 30, 10, 1);
        buildAllocationService("node_0,node_1");
        allocateAndCheckIndexShardHotSpots(false, 2, "node_2", "node_3");
        resetCluster();
        buildAllocationService("node_0,node_1");
        allocateAndCheckIndexShardHotSpots(false, 2, "node_2", "node_3");
    }
    /**
     * Test cluster scale in scenario with skewed shard distribution in remaining nodes.
     */
    public void testClusterScaleInWithSkew() {
        setupInitialCluster(4, 100, 5, 1);
        buildAllocationService("node_0,node_1");
        addNodesWithoutIndexing(1, "new_node_");
        allocateAndCheckIndexShardHotSpots(false, 3, "node_2", "node_3", "new_node_0");
        resetCluster();
        buildAllocationService("node_0,node_1");
        addNodesWithoutIndexing(1, "new_node_");
        allocateAndCheckIndexShardHotSpots(false, 3, "node_2", "node_3", "new_node_0");
    }
 }
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/ShardsLimitAllocationTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/ShardsLimitAllocationTests.java
@ -34,6 +34,7 @@ package org.opensearch.cluster.routing.allocation;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.hamcrest.Matcher;
 import org.opensearch.Version;
 import org.opensearch.cluster.ClusterState;
 import org.opensearch.cluster.OpenSearchAllocationTestCase;
@ -48,7 +49,13 @@ import org.opensearch.common.settings.Settings;
 import static org.opensearch.cluster.routing.ShardRoutingState.RELOCATING;
 import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.anyOf;
 public class ShardsLimitAllocationTests extends OpenSearchAllocationTestCase {
    private final Logger logger = LogManager.getLogger(ShardsLimitAllocationTests.class);
@ -201,13 +208,20 @@ public class ShardsLimitAllocationTests extends OpenSearchAllocationTestCase {
        clusterState = startInitializingShardsAndReroute(strategy, clusterState);
-        assertThat(RoutingNodesUtils.numberOfShardsOfType(clusterState.getRoutingNodes(), STARTED), equalTo(10));
+        // AllocationConstraints will choose node2 for 3 test1 shards and the rest are assigned to node1
        // These shards then relocate to node2 after balanceByWeights() is invoked
        assertThat(RoutingNodesUtils.numberOfShardsOfType(clusterState.getRoutingNodes(), STARTED), equalTo(8));
        assertThat(RoutingNodesUtils.numberOfShardsOfType(clusterState.getRoutingNodes(), RELOCATING), equalTo(2));
        // Expected values depend on which index shards were chosen for relocation.
        String relocatingIndex = getRelocatingIndex(clusterState);
        Matcher<String> node1Matcher = anyOf(equalTo("test"), equalTo("test1"));
        Matcher<String> node2Matcher = relocatingIndex.equals("test1") ? equalTo("test1") : node1Matcher;
        for (ShardRouting shardRouting : clusterState.getRoutingNodes().node("node1")) {
-            assertThat(shardRouting.getIndexName(), equalTo("test"));
+                assertThat(shardRouting.getIndexName(), node1Matcher);
        }
        for (ShardRouting shardRouting : clusterState.getRoutingNodes().node("node2")) {
-            assertThat(shardRouting.getIndexName(), equalTo("test1"));
+                assertThat(shardRouting.getIndexName(), node2Matcher);
        }
        logger.info("update {} for test, see that things move",
@ -226,10 +240,12 @@ public class ShardsLimitAllocationTests extends OpenSearchAllocationTestCase {
        logger.info("reroute after setting");
        clusterState = strategy.reroute(clusterState, "reroute");
-        assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(STARTED), equalTo(3));
+        int node1RelocCount = relocatingIndex.equals("test1") ? 4 : 2;
-        assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(RELOCATING), equalTo(2));
+        assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(RELOCATING), equalTo(node1RelocCount));
-        assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(RELOCATING), equalTo(2));
+
-        assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(STARTED), equalTo(3));
+        int node2RelocCount = relocatingIndex.equals("test1") ? 2 : 0;
        assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(RELOCATING), equalTo(node2RelocCount));
        assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(STARTED), equalTo(3 - node2RelocCount));
        // the first move will destroy the balance and the balancer will move 2 shards from node2 to node one right after
        // moving the nodes to node2 since we consider INITIALIZING nodes during rebalance
        clusterState = startInitializingShardsAndReroute(strategy, clusterState);
@ -237,4 +253,13 @@ public class ShardsLimitAllocationTests extends OpenSearchAllocationTestCase {
        assertThat(clusterState.getRoutingNodes().node("node1").numberOfShardsWithState(STARTED), equalTo(5));
        assertThat(clusterState.getRoutingNodes().node("node2").numberOfShardsWithState(STARTED), equalTo(5));
    }
    private String getRelocatingIndex(ClusterState clusterState) {
            final Set<String> indices = Stream.of(clusterState.getRoutingNodes())
                            .flatMap(node -> node.shardsWithState(RELOCATING).stream().map(x -> x.getIndexName()))
                            .collect(Collectors.toSet());
            assertThat(indices.size(), equalTo(1));
            return indices.iterator().next();
    }
 }
--- a/test/framework/src/main/java/org/opensearch/cluster/OpenSearchAllocationWithConstraintsTestCase.java
+++ b/test/framework/src/main/java/org/opensearch/cluster/OpenSearchAllocationWithConstraintsTestCase.java
@ -0,0 +1,336 @@
 /*
 * Copyright OpenSearch Contributors.
 * SPDX-License-Identifier: Apache-2.0
 */
 package org.opensearch.cluster;
 import org.opensearch.Version;
 import org.opensearch.cluster.metadata.IndexMetadata;
 import org.opensearch.cluster.metadata.Metadata;
 import org.opensearch.cluster.node.DiscoveryNodes;
 import org.opensearch.cluster.routing.RoutingNode;
 import org.opensearch.cluster.routing.RoutingTable;
 import org.opensearch.cluster.routing.ShardRouting;
 import org.opensearch.cluster.routing.UnassignedInfo;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.common.util.CollectionUtils;
 import org.junit.Before;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import static java.util.Collections.singletonMap;
 import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
 public abstract class OpenSearchAllocationWithConstraintsTestCase extends OpenSearchAllocationTestCase {
    protected MockAllocationService allocation;
    private ClusterState initialClusterState;
    protected ClusterState clusterState;
    private HashMap<String, Integer> indexShardCount;
    private HashMap<String, HashMap<String, Integer>> nodeShardsByIndex;
    private static final int MAX_REROUTE_STEPS_ALLOWED = 1500;
    @Before
    public void clearState() {
        allocation = null;
        initialClusterState = null;
        clusterState = null;
        indexShardCount = null;
        nodeShardsByIndex = null;
    }
    public static String shardIdentifierFromRouting(ShardRouting shardRouting) {
        return shardRouting.shardId().toString() + "[" + (shardRouting.primary() ? "p" : "r") + "]";
    }
    public void buildAllocationService() {
        Settings.Builder sb = Settings.builder();
        buildAllocationService(sb);
    }
    public void buildAllocationService(String excludeNodes) {
        logger.info("Excluding nodes: [{}]", excludeNodes);
        Settings.Builder sb = Settings.builder().put("cluster.routing.allocation.exclude._node_id", excludeNodes);
        buildAllocationService(sb);
    }
    public void buildZoneAwareAllocationService() {
        logger.info("Creating zone aware cluster");
        Settings.Builder sb = Settings.builder().put("cluster.routing.allocation.awareness.attributes", "zone");
        buildAllocationService(sb);
    }
    public void buildAllocationService(Settings.Builder sb) {
        sb.put("cluster.routing.allocation.node_concurrent_recoveries", 1);
        sb.put("cluster.routing.allocation.cluster_concurrent_rebalance", 1);
        allocation = createAllocationService(sb.build());
    }
    public Metadata buildMetadata(Metadata.Builder mb, String indexPrefix, int numberOfIndices, int numberOfShards,
            int numberOfReplicas) {
        for (int i = 0; i < numberOfIndices; i++) {
            mb.put(IndexMetadata.builder(indexPrefix + i)
                    .settings(settings(Version.CURRENT)
                            .put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "0"))
                    .numberOfShards(numberOfShards).numberOfReplicas(numberOfReplicas));
        }
        return mb.build();
    }
    public RoutingTable buildRoutingTable(RoutingTable.Builder rb, Metadata metadata, String indexPrefix,
            int numberOfIndices) {
        for (int i = 0; i < numberOfIndices; i++) {
            rb.addAsNew(metadata.index(indexPrefix + i));
        }
        return rb.build();
    }
    public DiscoveryNodes addNodes(DiscoveryNodes.Builder nb, String nodePrefix, int nodesAdded) {
        for (int i = 0; i < nodesAdded; i++) {
            nb.add(newNode(nodePrefix + i, singletonMap("_node_id", nodePrefix + i)));
        }
        return nb.build();
    }
    public void resetCluster() {
        clusterState = initialClusterState;
    }
    public void updateInitialCluster() {
        initialClusterState = clusterState;
    }
    public void createEmptyZoneAwareCluster(Map<String, Integer> nodesPerZone) {
        DiscoveryNodes.Builder nb = DiscoveryNodes.builder();
        int nodeId = 0;
        for (String zone : nodesPerZone.keySet()) {
            for (int i = 0; i < nodesPerZone.get(zone); i++) {
                nb.add(newNode("node_" + nodeId, singletonMap("zone", zone)));
                nodeId++;
            }
        }
        initialClusterState = ClusterState.builder(ClusterName.DEFAULT).nodes(nb.build()).build();
        clusterState = initialClusterState;
    }
    public void setupInitialCluster(int nodeCount, int indices, int shards, int replicas) {
        final String DEFAULT_INDEX_PREFIX = "index_";
        final String DEFAULT_NODE_PREFIX = "node_";
        Metadata metadata = buildMetadata(Metadata.builder(), DEFAULT_INDEX_PREFIX, indices, shards, replicas);
        RoutingTable routingTable = buildRoutingTable(RoutingTable.builder(), metadata, DEFAULT_INDEX_PREFIX, indices);
        DiscoveryNodes nodes = addNodes(DiscoveryNodes.builder(), DEFAULT_NODE_PREFIX, nodeCount);
        initialClusterState = ClusterState.builder(ClusterName.DEFAULT).metadata(metadata).routingTable(routingTable)
                .nodes(nodes).build();
        buildAllocationService();
        initialClusterState = allocateShardsAndBalance(initialClusterState);
        clusterState = initialClusterState;
        indexShardCount = new HashMap<>();
        for (int i = 0; i < indices; i++) {
            indexShardCount.put(DEFAULT_INDEX_PREFIX + i, shards * (replicas + 1));
        }
        assertForIndexShardHotSpots(false, nodeCount); // Initial cluster should be balanced
        logger.info("Initial cluster created...");
    }
    public void buildNodeShardsByIndex() {
        nodeShardsByIndex = new HashMap<>();
        for (RoutingNode rn : clusterState.getRoutingNodes()) {
            String node = rn.nodeId();
            nodeShardsByIndex.put(node, new HashMap<>());
            for (ShardRouting shard : rn) {
                assert shard.currentNodeId().equals(node);
                if (shard.state() == INITIALIZING || shard.state() == STARTED) {
                    nodeShardsByIndex.get(node).merge(shard.getIndexName(), 1, Integer::sum);
                }
            }
        }
    }
    public boolean isIndexHotSpotPresent(int nodes, List<String> nodeList) {
        for (String node : nodeList) {
            for (String index : nodeShardsByIndex.get(node).keySet()) {
                int count = nodeShardsByIndex.get(node).get(index);
                int limit = (int) Math.ceil(indexShardCount.get(index) / (float) nodes);
                if (count > limit) {
                    return true;
                }
            }
        }
        return false;
    }
    public List<String> getNodeList(String... nodesToValidate) {
        if (CollectionUtils.isEmpty(nodesToValidate)) {
            return Arrays.asList(clusterState.getNodes().resolveNodes());
        }
        return Arrays.asList(nodesToValidate);
    }
    public void assertForIndexShardHotSpots(boolean expected, int nodes, String... nodesToValidate) {
        List<String> nodeList = getNodeList(nodesToValidate);
        buildNodeShardsByIndex();
        assertEquals(expected, isIndexHotSpotPresent(nodes, nodeList));
    }
    public int allocateAndCheckIndexShardHotSpots(boolean expected, int nodes, String... nodesToValidate) {
        List<String> nodeList = getNodeList(nodesToValidate);
        boolean hasHotSpot = false;
        buildNodeShardsByIndex();
        List<ShardRouting> initShards;
        int movesToBalance = 0;
        do {
            assert movesToBalance <= MAX_REROUTE_STEPS_ALLOWED : "Could not balance cluster in max allowed moves";
            clusterState = allocation.applyStartedShards(clusterState,
                    clusterState.getRoutingNodes().shardsWithState(INITIALIZING));
            clusterState = allocation.reroute(clusterState, "reroute");
            initShards = clusterState.getRoutingNodes().shardsWithState(INITIALIZING);
            for (ShardRouting shard : initShards) {
                String node = shard.currentNodeId();
                String index = shard.getIndexName();
                nodeShardsByIndex.get(node).merge(index, 1, Integer::sum);
                if (!nodeList.contains(node)) {
                    continue;
                }
                int count = nodeShardsByIndex.get(node).get(index);
                int limit = (int) Math.ceil(indexShardCount.get(index) / (float) nodes);
                if (count <= limit) {
                    continue;
                }
                /**
                 * Hot spots can occur due to the order in which shards get allocated to nodes.
                 * A node with fewer shards may not be able to accept current shard due to
                 * SameShardAllocationDecider, causing it to breach allocation constraint on
                 * another node. We need to differentiate between such hot spots v/s actual hot
                 * spots.
                 *
                 * A simple check could be to ensure there is no node with shards less than
                 * allocation limit, that can accept current shard. However, in current
                 * allocation algorithm, when nodes get throttled, shards are added to
                 * ModelNodes without adding them to actual cluster (RoutingNodes). As a result,
                 * the shards per node we see here, are different from the ones observed by
                 * weight function in balancer. RoutingNodes with {@link count} < {@link limit}
                 * may not have had the same count in the corresponding ModelNode seen by weight
                 * function. We hence use the following alternate check --
                 *
                 * Given the way {@link limit} is defined, we should not have hot spots if *all*
                 * nodes are eligible to accept the shard. A hot spot is acceptable, if either
                 * all peer nodes have {@link count} > {@link limit}, or if even one node is
                 * ineligible to accept the shard due to SameShardAllocationDecider, as this
                 * leads to a chain of events that breach IndexShardsPerNode constraint on all
                 * other nodes.
                 */
                // If all peer nodes have count >= limit, hotspot is acceptable
                boolean limitBreachedOnAllNodes = true;
                for (RoutingNode peerNode : clusterState.getRoutingNodes()) {
                    if (peerNode.nodeId().equals(node)) {
                        continue;
                    }
                    int peerCount = nodeShardsByIndex.get(peerNode.nodeId()).getOrDefault(index, 0);
                    if (peerCount < limit) {
                        limitBreachedOnAllNodes = false;
                    }
                }
                if (limitBreachedOnAllNodes) {
                    continue;
                }
                // If any node is ineligible to accept the shard, this hot spot is acceptable
                boolean peerHasSameShard = false;
                for (RoutingNode peerNode : clusterState.getRoutingNodes()) {
                    if (peerNode.nodeId().equals(node)) {
                        continue;
                    }
                    ShardRouting sameIdShardOnPeer = peerNode.getByShardId(shard.shardId());
                    if (sameIdShardOnPeer != null && sameIdShardOnPeer.getIndexName().equals(index)) {
                        peerHasSameShard = true;
                    }
                }
                if (peerHasSameShard) {
                    continue;
                }
                hasHotSpot = true;
            }
            movesToBalance++;
        } while (!initShards.isEmpty());
        logger.info("HotSpot: [{}], Moves to balance: [{}]", hasHotSpot, movesToBalance);
        assertEquals(expected, hasHotSpot);
        assertForIndexShardHotSpots(false, nodes); // Post re-balancing, cluster should always be hot spot free.
        return movesToBalance;
    }
    public ClusterState allocateShardsAndBalance(ClusterState clusterState) {
        int iterations = 0;
        do {
            clusterState = allocation.applyStartedShards(clusterState,
                    clusterState.getRoutingNodes().shardsWithState(INITIALIZING));
            clusterState = allocation.reroute(clusterState, "reroute");
            iterations++;
        } while (!clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty()
                && iterations < MAX_REROUTE_STEPS_ALLOWED);
        return clusterState;
    }
    public void addNodesWithoutIndexing(int nodeCount, String node_prefix) {
        DiscoveryNodes nodes = addNodes(DiscoveryNodes.builder(clusterState.nodes()), node_prefix, nodeCount);
        clusterState = ClusterState.builder(clusterState).nodes(nodes).build();
    }
    public void terminateNodes(String... nodesToTerminate) {
        if (CollectionUtils.isEmpty(nodesToTerminate)) {
            return;
        }
        DiscoveryNodes.Builder nb = DiscoveryNodes.builder(clusterState.nodes());
        Arrays.asList(nodesToTerminate).forEach(node -> nb.remove(node));
        clusterState = ClusterState.builder(clusterState).nodes(nb.build()).build();
        clusterState = allocation.disassociateDeadNodes(clusterState, true, "node-terminated");
        clusterState = allocateShardsAndBalance(clusterState);
    }
    public void addNodesWithIndexing(int nodeCount, String node_prefix, int indices, int shards, int replicas) {
        final String NEW_INDEX_PREFIX = "new_index_";
        Metadata md = buildMetadata(Metadata.builder(clusterState.getMetadata()), NEW_INDEX_PREFIX, indices, shards,
                replicas);
        RoutingTable rb = buildRoutingTable(RoutingTable.builder(clusterState.getRoutingTable()), md, NEW_INDEX_PREFIX,
                indices);
        DiscoveryNodes nodes = addNodes(DiscoveryNodes.builder(clusterState.nodes()), node_prefix, nodeCount);
        clusterState = ClusterState.builder(clusterState).metadata(md).routingTable(rb).nodes(nodes).build();
        for (int i = 0; i < indices; i++) {
            indexShardCount.put(NEW_INDEX_PREFIX + i, shards * (replicas + 1));
        }
    }
    public void addIndices(String index_prefix, int indices, int shards, int replicas) {
        Metadata md = buildMetadata(Metadata.builder(clusterState.getMetadata()), index_prefix, indices, shards,
                replicas);
        RoutingTable rb = buildRoutingTable(RoutingTable.builder(clusterState.getRoutingTable()), md, index_prefix,
                indices);
        clusterState = ClusterState.builder(clusterState).metadata(md).routingTable(rb).build();
        if (indexShardCount == null) {
            indexShardCount = new HashMap<>();
        }
        for (int i = 0; i < indices; i++) {
            indexShardCount.put(index_prefix + i, shards * (replicas + 1));
        }
    }
 }