BalancedShardAllocator code improvements (#20746)

This commit improves the logic flow of BalancedShardsAllocator in preparation for separating out components of this class to be used in the cluster allocation explain APIs. In particular, this commit: 1. Adds a minimum value for the index/shard balance factor settings (0.0) 2. Makes the Balancer data structures immutable and pre-calculated at construction time. 3. Removes difficult to follow labeled blocks / GOTOs 4. Better logic for skipping over the same replica set when one of the replicas received a NO decision 5. Separates the decision making logic for a single shard from the logic to iterate over all unassigned shards.
2016-10-05 14:23:25 -04:00 · 2016-10-05 14:23:25 -04:00 · 15950b71b8
parent 8e27d741c0
commit 15950b71b8
1 changed files with 134 additions and 111 deletions
--- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
@ -73,9 +73,9 @@ import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
 public class BalancedShardsAllocator extends AbstractComponent implements ShardsAllocator {

    public static final Setting<Float> INDEX_BALANCE_FACTOR_SETTING =
-        Setting.floatSetting("cluster.routing.allocation.balance.index", 0.55f, Property.Dynamic, Property.NodeScope);
+        Setting.floatSetting("cluster.routing.allocation.balance.index", 0.55f, 0.0f, Property.Dynamic, Property.NodeScope);
    public static final Setting<Float> SHARD_BALANCE_FACTOR_SETTING =
-        Setting.floatSetting("cluster.routing.allocation.balance.shard", 0.45f, Property.Dynamic, Property.NodeScope);
+        Setting.floatSetting("cluster.routing.allocation.balance.shard", 0.45f, 0.0f, Property.Dynamic, Property.NodeScope);
    public static final Setting<Float> THRESHOLD_SETTING =
        Setting.floatSetting("cluster.routing.allocation.balance.threshold", 1.0f, 0.0f,
            Property.Dynamic, Property.NodeScope);
@ -210,7 +210,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
     */
    public static class Balancer {
        private final Logger logger;
-        private final Map<String, ModelNode> nodes = new HashMap<>();
+        private final Map<String, ModelNode> nodes;
        private final RoutingAllocation allocation;
        private final RoutingNodes routingNodes;
        private final WeightFunction weight;
@ -218,6 +218,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
        private final float threshold;
        private final MetaData metaData;
        private final float avgShardsPerNode;
+        private final NodeSorter sorter;

        public Balancer(Logger logger, RoutingAllocation allocation, WeightFunction weight, float threshold) {
            this.logger = logger;
@ -227,7 +228,8 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
            this.routingNodes = allocation.routingNodes();
            this.metaData = allocation.metaData();
            avgShardsPerNode = ((float) metaData.getTotalNumberOfShards()) / routingNodes.size();
-            buildModelFromAssigned();
+            nodes = Collections.unmodifiableMap(buildModelFromAssigned());
+            sorter = newNodeSorter();
        }

        /**
@ -304,11 +306,10 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
        }

        public Map<DiscoveryNode, Float> weighShard(ShardRouting shard) {
-            final NodeSorter sorter = newNodeSorter();
            final ModelNode[] modelNodes = sorter.modelNodes;
            final float[] weights = sorter.weights;

-            buildWeightOrderedIndices(sorter);
+            buildWeightOrderedIndices();
            Map<DiscoveryNode, Float> nodes = new HashMap<>(modelNodes.length);
            float currentNodeWeight = 0.0f;
            for (int i = 0; i < modelNodes.length; i++) {
@ -332,20 +333,19 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
         * weight of the maximum node and the minimum node according to the
         * {@link WeightFunction}. This weight is calculated per index to
         * distribute shards evenly per index. The balancer tries to relocate
-         * shards only if the delta exceeds the threshold. If the default case
+         * shards only if the delta exceeds the threshold. In the default case
         * the threshold is set to <tt>1.0</tt> to enforce gaining relocation
         * only, or in other words relocations that move the weight delta closer
         * to <tt>0.0</tt>
         */
        private void balanceByWeights() {
-            final NodeSorter sorter = newNodeSorter();
            final AllocationDeciders deciders = allocation.deciders();
            final ModelNode[] modelNodes = sorter.modelNodes;
            final float[] weights = sorter.weights;
-            for (String index : buildWeightOrderedIndices(sorter)) {
+            for (String index : buildWeightOrderedIndices()) {
                IndexMetaData indexMetaData = metaData.index(index);

-                // find nodes that have a shard of this index or where shards of this index are allowed to stay
+                // find nodes that have a shard of this index or where shards of this index are allowed to be allocated to,
                // move these nodes to the front of modelNodes so that we can only balance based on these nodes
                int relevantNodes = 0;
                for (int i = 0; i < modelNodes.length; i++) {
@ -440,14 +440,14 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
         * allocations on added nodes from one index when the weight parameters
         * for global balance overrule the index balance at an intermediate
         * state. For example this can happen if we have 3 nodes and 3 indices
-         * with 3 shards and 1 shard. At the first stage all three nodes hold
-         * 2 shard for each index. now we add another node and the first index
-         * is balanced moving 3 two of the nodes over to the new node since it
+         * with 3 primary and 1 replica shards. At the first stage all three nodes hold
+         * 2 shard for each index. Now we add another node and the first index
+         * is balanced moving three shards from two of the nodes over to the new node since it
         * has no shards yet and global balance for the node is way below
         * average. To re-balance we need to move shards back eventually likely
         * to the nodes we relocated them from.
         */
-        private String[] buildWeightOrderedIndices(NodeSorter sorter) {
+        private String[] buildWeightOrderedIndices() {
            final String[] indices = allocation.routingTable().indicesRouting().keys().toArray(String.class);
            final float[] deltas = new float[indices.length];
            for (int i = 0; i < deltas.length; i++) {
@ -501,7 +501,6 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
            // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling
            // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are
            // offloading the shards.
-            final NodeSorter sorter = newNodeSorter();
            for (Iterator<ShardRouting> it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext(); ) {
                ShardRouting shardRouting = it.next();
                // we can only move started shards...
@ -511,7 +510,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
                    RoutingNode routingNode = sourceNode.getRoutingNode();
                    Decision decision = allocation.deciders().canRemain(shardRouting, routingNode, allocation);
                    if (decision.type() == Decision.Type.NO) {
-                        moveShard(sorter, shardRouting, sourceNode, routingNode);
+                        moveShard(shardRouting, sourceNode, routingNode);
                    }
                }
            }
@ -520,7 +519,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
        /**
         * Move started shard to the minimal eligible node with respect to the weight function
         */
-        private void moveShard(NodeSorter sorter, ShardRouting shardRouting, ModelNode sourceNode, RoutingNode routingNode) {
+        private void moveShard(ShardRouting shardRouting, ModelNode sourceNode, RoutingNode routingNode) {
            logger.debug("[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...", shardRouting.index(), shardRouting.id(), routingNode.node());
            sorter.reset(shardRouting.getIndexName());
            /*
@ -557,7 +556,8 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
         * on the target node which we respect during the allocation / balancing
         * process. In short, this method recreates the status-quo in the cluster.
         */
-        private void buildModelFromAssigned() {
+        private Map<String, ModelNode> buildModelFromAssigned() {
+            Map<String, ModelNode> nodes = new HashMap<>();
            for (RoutingNode rn : routingNodes) {
                ModelNode node = new ModelNode(rn);
                nodes.put(rn.nodeId(), node);
@ -572,6 +572,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
                    }
                }
            }
+            return nodes;
        }

        /**
@ -626,91 +627,37 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
            do {
                for (int i = 0; i < primaryLength; i++) {
                    ShardRouting shard = primary[i];
-                    if (!shard.primary()) {
-                        final Decision decision = deciders.canAllocate(shard, allocation);
-                        if (decision.type() == Type.NO) {
-                            UnassignedInfo.AllocationStatus allocationStatus = UnassignedInfo.AllocationStatus.fromDecision(decision);
-                            unassigned.ignoreShard(shard, allocationStatus, allocation.changes());
-                            while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
-                                unassigned.ignoreShard(primary[++i], allocationStatus, allocation.changes());
-                            }
-                            continue;
-                        } else {
+                    Tuple<Decision, ModelNode> allocationDecision = allocateUnassignedShard(shard, throttledNodes);
+                    final Decision decision = allocationDecision.v1();
+                    final ModelNode minNode = allocationDecision.v2();
+
+                    if (decision.type() == Type.YES) {
+                        if (logger.isTraceEnabled()) {
+                            logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
+                        }
+
+                        final long shardSize = DiskThresholdDecider.getExpectedShardSize(shard, allocation,
+                            ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
+                        shard = routingNodes.initializeShard(shard, minNode.getNodeId(), null, shardSize, allocation.changes());
+                        minNode.addShard(shard);
+                        if (!shard.primary()) {
+                            // copy over the same replica shards to the secondary array so they will get allocated
+                            // in a subsequent iteration, allowing replicas of other shards to be allocated first
                            while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
                                secondary[secondaryLength++] = primary[++i];
                            }
                        }
-                    }
-                    assert !shard.assignedToNode() : shard;
-                    /* find an node with minimal weight we can allocate on*/
-                    float minWeight = Float.POSITIVE_INFINITY;
-                    ModelNode minNode = null;
-                    Decision decision = null;
-                    if (throttledNodes.size() < nodes.size()) {
-                        /* Don't iterate over an identity hashset here the
-                         * iteration order is different for each run and makes testing hard */
-                        for (ModelNode node : nodes.values()) {
-                            if (throttledNodes.contains(node)) {
-                                continue;
-                            }
-                            if (!node.containsShard(shard)) {
-                                // simulate weight if we would add shard to node
-                                float currentWeight = weight.weightShardAdded(this, node, shard.getIndexName());
-                                /*
-                                 * Unless the operation is not providing any gains we
-                                 * don't check deciders
-                                 */
-                                if (currentWeight <= minWeight) {
-                                    Decision currentDecision = deciders.canAllocate(shard, node.getRoutingNode(), allocation);
-                                    NOUPDATE:
-                                    if (currentDecision.type() == Type.YES || currentDecision.type() == Type.THROTTLE) {
-                                        if (currentWeight == minWeight) {
-                                            /*  we have an equal weight tie breaking:
-                                             *  1. if one decision is YES prefer it
-                                             *  2. prefer the node that holds the primary for this index with the next id in the ring ie.
-                                             *  for the 3 shards 2 replica case we try to build up:
-                                             *    1 2 0
-                                             *    2 0 1
-                                             *    0 1 2
-                                             *  such that if we need to tie-break we try to prefer the node holding a shard with the minimal id greater
-                                             *  than the id of the shard we need to assign. This works find when new indices are created since
-                                             *  primaries are added first and we only add one shard set a time in this algorithm.
-                                             */
-                                            if (currentDecision.type() == decision.type()) {
-                                                final int repId = shard.id();
-                                                final int nodeHigh = node.highestPrimary(shard.index().getName());
-                                                final int minNodeHigh = minNode.highestPrimary(shard.getIndexName());
-                                                if ((((nodeHigh > repId && minNodeHigh > repId) || (nodeHigh < repId && minNodeHigh < repId)) && (nodeHigh < minNodeHigh))
-                                                        || (nodeHigh > minNodeHigh && nodeHigh > repId && minNodeHigh < repId)) {
-                                                    // nothing to set here; the minNode, minWeight, and decision get set below
-                                                } else {
-                                                    break NOUPDATE;
-                                                }
-                                            } else if (currentDecision.type() != Type.YES) {
-                                                break NOUPDATE;
-                                            }
-                                        }
-                                        minNode = node;
-                                        minWeight = currentWeight;
-                                        decision = currentDecision;
-                                    }
-                                }
-                            }
+                    } else {
+                        // did *not* receive a YES decision
+                        if (logger.isTraceEnabled()) {
+                            logger.trace("No eligible node found to assign shard [{}] decision [{}]", shard, decision.type());
                        }
-                    }
-                    assert (decision == null) == (minNode == null);
-                    if (minNode != null) {
-                        final long shardSize = DiskThresholdDecider.getExpectedShardSize(shard, allocation,
-                            ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
-                        if (decision.type() == Type.YES) {
-                            if (logger.isTraceEnabled()) {
-                                logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
-                            }

-                            shard = routingNodes.initializeShard(shard, minNode.getNodeId(), null, shardSize, allocation.changes());
-                            minNode.addShard(shard);
-                            continue; // don't add to ignoreUnassigned
-                        } else {
+                        if (minNode != null) {
+                            // throttle decision scenario
+                            assert decision.type() == Type.THROTTLE;
+                            final long shardSize = DiskThresholdDecider.getExpectedShardSize(shard, allocation,
+                                ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
                            minNode.addShard(shard.initialize(minNode.getNodeId(), null, shardSize));
                            final RoutingNode node = minNode.getRoutingNode();
                            final Decision.Type nodeLevelDecision = deciders.canAllocate(node, allocation).type();
@ -721,21 +668,19 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
                                assert nodeLevelDecision == Type.NO;
                                throttledNodes.add(minNode);
                            }
+                        } else {
+                            assert decision.type() == Type.NO;
+                            if (logger.isTraceEnabled()) {
+                                logger.trace("No Node found to assign shard [{}]", shard);
+                            }
                        }
-                        if (logger.isTraceEnabled()) {
-                            logger.trace("No eligible node found to assign shard [{}] decision [{}]", shard, decision.type());
-                        }
-                    } else if (logger.isTraceEnabled()) {
-                        logger.trace("No Node found to assign shard [{}]", shard);
-                    }
-                    assert decision == null || decision.type() == Type.THROTTLE;
-                    UnassignedInfo.AllocationStatus allocationStatus =
-                        decision == null ? UnassignedInfo.AllocationStatus.DECIDERS_NO :
-                                           UnassignedInfo.AllocationStatus.fromDecision(decision);
-                    unassigned.ignoreShard(shard, allocationStatus, allocation.changes());
-                    if (!shard.primary()) { // we could not allocate it and we are a replica - check if we can ignore the other replicas
-                        while(secondaryLength > 0 && comparator.compare(shard, secondary[secondaryLength-1]) == 0) {
-                            unassigned.ignoreShard(secondary[--secondaryLength], allocationStatus, allocation.changes());
+
+                        UnassignedInfo.AllocationStatus allocationStatus = UnassignedInfo.AllocationStatus.fromDecision(decision);
+                        unassigned.ignoreShard(shard, allocationStatus, allocation.changes());
+                        if (!shard.primary()) { // we could not allocate it and we are a replica - check if we can ignore the other replicas
+                            while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
+                                unassigned.ignoreShard(primary[++i], allocationStatus, allocation.changes());
+                            }
                        }
                    }
                }
@ -748,6 +693,84 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
            // clear everything we have either added it or moved to ignoreUnassigned
        }

+        /**
+         * Make a decision for allocating an unassigned shard.  This method returns a two values in a tuple: the
+         * first value is the {@link Decision} taken to allocate the unassigned shard, the second value is the
+         * {@link ModelNode} representing the node that the shard should be assigned to.  If the decision returned
+         * is of type {@link Type#NO}, then the assigned node will be null.
+         */
+        private Tuple<Decision, ModelNode> allocateUnassignedShard(final ShardRouting shard, final Set<ModelNode> throttledNodes) {
+            assert !shard.assignedToNode() : "not an unassigned shard: " + shard;
+            if (allocation.deciders().canAllocate(shard, allocation).type() == Type.NO) {
+                // NO decision for allocating the shard, irrespective of any particular node, so exit early
+                return Tuple.tuple(Decision.NO, null);
+            }
+
+            /* find an node with minimal weight we can allocate on*/
+            float minWeight = Float.POSITIVE_INFINITY;
+            ModelNode minNode = null;
+            Decision decision = null;
+            if (throttledNodes.size() < nodes.size()) {
+                    /* Don't iterate over an identity hashset here the
+                     * iteration order is different for each run and makes testing hard */
+                for (ModelNode node : nodes.values()) {
+                    if (throttledNodes.contains(node)) {
+                        continue;
+                    }
+                    if (!node.containsShard(shard)) {
+                        // simulate weight if we would add shard to node
+                        float currentWeight = weight.weightShardAdded(this, node, shard.getIndexName());
+                            /*
+                             * Unless the operation is not providing any gains we
+                             * don't check deciders
+                             */
+                        if (currentWeight <= minWeight) {
+                            Decision currentDecision = allocation.deciders().canAllocate(shard, node.getRoutingNode(), allocation);
+                            if (currentDecision.type() == Type.YES || currentDecision.type() == Type.THROTTLE) {
+                                final boolean updateMinNode;
+                                if (currentWeight == minWeight) {
+                                        /*  we have an equal weight tie breaking:
+                                         *  1. if one decision is YES prefer it
+                                         *  2. prefer the node that holds the primary for this index with the next id in the ring ie.
+                                         *  for the 3 shards 2 replica case we try to build up:
+                                         *    1 2 0
+                                         *    2 0 1
+                                         *    0 1 2
+                                         *  such that if we need to tie-break we try to prefer the node holding a shard with the minimal id greater
+                                         *  than the id of the shard we need to assign. This works find when new indices are created since
+                                         *  primaries are added first and we only add one shard set a time in this algorithm.
+                                         */
+                                    if (currentDecision.type() == decision.type()) {
+                                        final int repId = shard.id();
+                                        final int nodeHigh = node.highestPrimary(shard.index().getName());
+                                        final int minNodeHigh = minNode.highestPrimary(shard.getIndexName());
+                                        updateMinNode = ((((nodeHigh > repId && minNodeHigh > repId)
+                                                               || (nodeHigh < repId && minNodeHigh < repId))
+                                                              && (nodeHigh < minNodeHigh))
+                                                             || (nodeHigh > minNodeHigh && nodeHigh > repId && minNodeHigh < repId));
+                                    } else {
+                                        updateMinNode = currentDecision.type() == Type.YES;
+                                    }
+                                } else {
+                                    updateMinNode = true;
+                                }
+                                if (updateMinNode) {
+                                    minNode = node;
+                                    minWeight = currentWeight;
+                                    decision = currentDecision;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            if (decision == null) {
+                // decision was not set and a node was not assigned, so treat it as a NO decision
+                decision = Decision.NO;
+            }
+            return Tuple.tuple(decision, minNode);
+        }
+
        /**
         * Tries to find a relocation from the max node to the minimal node for an arbitrary shard of the given index on the
         * balance model. Iff this method returns a <code>true</code> the relocation has already been executed on the