BalancedShardsAllocator code improvements (#20746)

This commit improves the logic flow of BalancedShardsAllocator in
preparation for separating out components of this class to be used
in the cluster allocation explain APIs.  In particular, this commit:

 1. Adds a minimum value for the index/shard balance factor settings (0.0)
 2. Makes the Balancer data structures immutable and pre-calculated at
    construction time.
 3. Removes difficult-to-follow labeled blocks / GOTOs
 4. Better logic for skipping over the same replica set when one of
    the replicas received a NO decision
 5. Separates the decision making logic for a single shard from the logic
    to iterate over all unassigned shards.
This commit is contained in:
Ali Beyad 2016-10-05 14:23:25 -04:00 committed by GitHub
parent 8e27d741c0
commit 15950b71b8
1 changed files with 134 additions and 111 deletions

View File

@ -73,9 +73,9 @@ import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
public class BalancedShardsAllocator extends AbstractComponent implements ShardsAllocator {
public static final Setting<Float> INDEX_BALANCE_FACTOR_SETTING =
Setting.floatSetting("cluster.routing.allocation.balance.index", 0.55f, Property.Dynamic, Property.NodeScope);
Setting.floatSetting("cluster.routing.allocation.balance.index", 0.55f, 0.0f, Property.Dynamic, Property.NodeScope);
public static final Setting<Float> SHARD_BALANCE_FACTOR_SETTING =
Setting.floatSetting("cluster.routing.allocation.balance.shard", 0.45f, Property.Dynamic, Property.NodeScope);
Setting.floatSetting("cluster.routing.allocation.balance.shard", 0.45f, 0.0f, Property.Dynamic, Property.NodeScope);
public static final Setting<Float> THRESHOLD_SETTING =
Setting.floatSetting("cluster.routing.allocation.balance.threshold", 1.0f, 0.0f,
Property.Dynamic, Property.NodeScope);
@ -210,7 +210,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
*/
public static class Balancer {
private final Logger logger;
private final Map<String, ModelNode> nodes = new HashMap<>();
private final Map<String, ModelNode> nodes;
private final RoutingAllocation allocation;
private final RoutingNodes routingNodes;
private final WeightFunction weight;
@ -218,6 +218,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
private final float threshold;
private final MetaData metaData;
private final float avgShardsPerNode;
private final NodeSorter sorter;
public Balancer(Logger logger, RoutingAllocation allocation, WeightFunction weight, float threshold) {
this.logger = logger;
@ -227,7 +228,8 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
this.routingNodes = allocation.routingNodes();
this.metaData = allocation.metaData();
avgShardsPerNode = ((float) metaData.getTotalNumberOfShards()) / routingNodes.size();
buildModelFromAssigned();
nodes = Collections.unmodifiableMap(buildModelFromAssigned());
sorter = newNodeSorter();
}
/**
@ -304,11 +306,10 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
}
public Map<DiscoveryNode, Float> weighShard(ShardRouting shard) {
final NodeSorter sorter = newNodeSorter();
final ModelNode[] modelNodes = sorter.modelNodes;
final float[] weights = sorter.weights;
buildWeightOrderedIndices(sorter);
buildWeightOrderedIndices();
Map<DiscoveryNode, Float> nodes = new HashMap<>(modelNodes.length);
float currentNodeWeight = 0.0f;
for (int i = 0; i < modelNodes.length; i++) {
@ -332,20 +333,19 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
* weight of the maximum node and the minimum node according to the
* {@link WeightFunction}. This weight is calculated per index to
* distribute shards evenly per index. The balancer tries to relocate
* shards only if the delta exceeds the threshold. If the default case
* shards only if the delta exceeds the threshold. In the default case
* the threshold is set to <tt>1.0</tt> to enforce gaining relocation
* only, or in other words relocations that move the weight delta closer
* to <tt>0.0</tt>
*/
private void balanceByWeights() {
final NodeSorter sorter = newNodeSorter();
final AllocationDeciders deciders = allocation.deciders();
final ModelNode[] modelNodes = sorter.modelNodes;
final float[] weights = sorter.weights;
for (String index : buildWeightOrderedIndices(sorter)) {
for (String index : buildWeightOrderedIndices()) {
IndexMetaData indexMetaData = metaData.index(index);
// find nodes that have a shard of this index or where shards of this index are allowed to stay
// find nodes that have a shard of this index or where shards of this index are allowed to be allocated to,
// move these nodes to the front of modelNodes so that we can only balance based on these nodes
int relevantNodes = 0;
for (int i = 0; i < modelNodes.length; i++) {
@ -440,14 +440,14 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
* allocations on added nodes from one index when the weight parameters
* for global balance overrule the index balance at an intermediate
* state. For example this can happen if we have 3 nodes and 3 indices
* with 3 shards and 1 shard. At the first stage all three nodes hold
* 2 shard for each index. now we add another node and the first index
* is balanced moving 3 two of the nodes over to the new node since it
* with 3 primary and 1 replica shards. At the first stage all three nodes hold
2 shards for each index. Now we add another node and the first index
* is balanced moving three shards from two of the nodes over to the new node since it
* has no shards yet and global balance for the node is way below
* average. To re-balance we need to move shards back eventually likely
* to the nodes we relocated them from.
*/
private String[] buildWeightOrderedIndices(NodeSorter sorter) {
private String[] buildWeightOrderedIndices() {
final String[] indices = allocation.routingTable().indicesRouting().keys().toArray(String.class);
final float[] deltas = new float[indices.length];
for (int i = 0; i < deltas.length; i++) {
@ -501,7 +501,6 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
// Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling
// shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are
// offloading the shards.
final NodeSorter sorter = newNodeSorter();
for (Iterator<ShardRouting> it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext(); ) {
ShardRouting shardRouting = it.next();
// we can only move started shards...
@ -511,7 +510,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
RoutingNode routingNode = sourceNode.getRoutingNode();
Decision decision = allocation.deciders().canRemain(shardRouting, routingNode, allocation);
if (decision.type() == Decision.Type.NO) {
moveShard(sorter, shardRouting, sourceNode, routingNode);
moveShard(shardRouting, sourceNode, routingNode);
}
}
}
@ -520,7 +519,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
/**
* Move started shard to the minimal eligible node with respect to the weight function
*/
private void moveShard(NodeSorter sorter, ShardRouting shardRouting, ModelNode sourceNode, RoutingNode routingNode) {
private void moveShard(ShardRouting shardRouting, ModelNode sourceNode, RoutingNode routingNode) {
logger.debug("[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...", shardRouting.index(), shardRouting.id(), routingNode.node());
sorter.reset(shardRouting.getIndexName());
/*
@ -557,7 +556,8 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
* on the target node which we respect during the allocation / balancing
* process. In short, this method recreates the status-quo in the cluster.
*/
private void buildModelFromAssigned() {
private Map<String, ModelNode> buildModelFromAssigned() {
Map<String, ModelNode> nodes = new HashMap<>();
for (RoutingNode rn : routingNodes) {
ModelNode node = new ModelNode(rn);
nodes.put(rn.nodeId(), node);
@ -572,6 +572,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
}
}
}
return nodes;
}
/**
@ -626,91 +627,37 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
do {
for (int i = 0; i < primaryLength; i++) {
ShardRouting shard = primary[i];
if (!shard.primary()) {
final Decision decision = deciders.canAllocate(shard, allocation);
if (decision.type() == Type.NO) {
UnassignedInfo.AllocationStatus allocationStatus = UnassignedInfo.AllocationStatus.fromDecision(decision);
unassigned.ignoreShard(shard, allocationStatus, allocation.changes());
while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
unassigned.ignoreShard(primary[++i], allocationStatus, allocation.changes());
}
continue;
} else {
Tuple<Decision, ModelNode> allocationDecision = allocateUnassignedShard(shard, throttledNodes);
final Decision decision = allocationDecision.v1();
final ModelNode minNode = allocationDecision.v2();
if (decision.type() == Type.YES) {
if (logger.isTraceEnabled()) {
logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
}
final long shardSize = DiskThresholdDecider.getExpectedShardSize(shard, allocation,
ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
shard = routingNodes.initializeShard(shard, minNode.getNodeId(), null, shardSize, allocation.changes());
minNode.addShard(shard);
if (!shard.primary()) {
// copy over the same replica shards to the secondary array so they will get allocated
// in a subsequent iteration, allowing replicas of other shards to be allocated first
while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
secondary[secondaryLength++] = primary[++i];
}
}
}
assert !shard.assignedToNode() : shard;
/* find an node with minimal weight we can allocate on*/
float minWeight = Float.POSITIVE_INFINITY;
ModelNode minNode = null;
Decision decision = null;
if (throttledNodes.size() < nodes.size()) {
/* Don't iterate over an identity hashset here the
* iteration order is different for each run and makes testing hard */
for (ModelNode node : nodes.values()) {
if (throttledNodes.contains(node)) {
continue;
}
if (!node.containsShard(shard)) {
// simulate weight if we would add shard to node
float currentWeight = weight.weightShardAdded(this, node, shard.getIndexName());
/*
* Unless the operation is not providing any gains we
* don't check deciders
*/
if (currentWeight <= minWeight) {
Decision currentDecision = deciders.canAllocate(shard, node.getRoutingNode(), allocation);
NOUPDATE:
if (currentDecision.type() == Type.YES || currentDecision.type() == Type.THROTTLE) {
if (currentWeight == minWeight) {
/* we have an equal weight tie breaking:
* 1. if one decision is YES prefer it
* 2. prefer the node that holds the primary for this index with the next id in the ring ie.
* for the 3 shards 2 replica case we try to build up:
* 1 2 0
* 2 0 1
* 0 1 2
* such that if we need to tie-break we try to prefer the node holding a shard with the minimal id greater
* than the id of the shard we need to assign. This works find when new indices are created since
* primaries are added first and we only add one shard set a time in this algorithm.
*/
if (currentDecision.type() == decision.type()) {
final int repId = shard.id();
final int nodeHigh = node.highestPrimary(shard.index().getName());
final int minNodeHigh = minNode.highestPrimary(shard.getIndexName());
if ((((nodeHigh > repId && minNodeHigh > repId) || (nodeHigh < repId && minNodeHigh < repId)) && (nodeHigh < minNodeHigh))
|| (nodeHigh > minNodeHigh && nodeHigh > repId && minNodeHigh < repId)) {
// nothing to set here; the minNode, minWeight, and decision get set below
} else {
break NOUPDATE;
}
} else if (currentDecision.type() != Type.YES) {
break NOUPDATE;
}
}
minNode = node;
minWeight = currentWeight;
decision = currentDecision;
}
}
}
} else {
// did *not* receive a YES decision
if (logger.isTraceEnabled()) {
logger.trace("No eligible node found to assign shard [{}] decision [{}]", shard, decision.type());
}
}
assert (decision == null) == (minNode == null);
if (minNode != null) {
final long shardSize = DiskThresholdDecider.getExpectedShardSize(shard, allocation,
ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
if (decision.type() == Type.YES) {
if (logger.isTraceEnabled()) {
logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
}
shard = routingNodes.initializeShard(shard, minNode.getNodeId(), null, shardSize, allocation.changes());
minNode.addShard(shard);
continue; // don't add to ignoreUnassigned
} else {
if (minNode != null) {
// throttle decision scenario
assert decision.type() == Type.THROTTLE;
final long shardSize = DiskThresholdDecider.getExpectedShardSize(shard, allocation,
ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
minNode.addShard(shard.initialize(minNode.getNodeId(), null, shardSize));
final RoutingNode node = minNode.getRoutingNode();
final Decision.Type nodeLevelDecision = deciders.canAllocate(node, allocation).type();
@ -721,21 +668,19 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
assert nodeLevelDecision == Type.NO;
throttledNodes.add(minNode);
}
} else {
assert decision.type() == Type.NO;
if (logger.isTraceEnabled()) {
logger.trace("No Node found to assign shard [{}]", shard);
}
}
if (logger.isTraceEnabled()) {
logger.trace("No eligible node found to assign shard [{}] decision [{}]", shard, decision.type());
}
} else if (logger.isTraceEnabled()) {
logger.trace("No Node found to assign shard [{}]", shard);
}
assert decision == null || decision.type() == Type.THROTTLE;
UnassignedInfo.AllocationStatus allocationStatus =
decision == null ? UnassignedInfo.AllocationStatus.DECIDERS_NO :
UnassignedInfo.AllocationStatus.fromDecision(decision);
unassigned.ignoreShard(shard, allocationStatus, allocation.changes());
if (!shard.primary()) { // we could not allocate it and we are a replica - check if we can ignore the other replicas
while(secondaryLength > 0 && comparator.compare(shard, secondary[secondaryLength-1]) == 0) {
unassigned.ignoreShard(secondary[--secondaryLength], allocationStatus, allocation.changes());
UnassignedInfo.AllocationStatus allocationStatus = UnassignedInfo.AllocationStatus.fromDecision(decision);
unassigned.ignoreShard(shard, allocationStatus, allocation.changes());
if (!shard.primary()) { // we could not allocate it and we are a replica - check if we can ignore the other replicas
while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
unassigned.ignoreShard(primary[++i], allocationStatus, allocation.changes());
}
}
}
}
@ -748,6 +693,84 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
// clear everything we have either added it or moved to ignoreUnassigned
}
/**
 * Decides where, if anywhere, a single unassigned shard should be allocated.
 * <p>
 * The result is a tuple of two values: the {@link Decision} reached for the shard, and the
 * {@link ModelNode} the shard should be assigned to. Whenever no node was selected, the node is
 * {@code null} and the decision is {@link Decision#NO}.
 *
 * @param shard          the unassigned shard to find a node for
 * @param throttledNodes nodes that must not be considered as allocation targets
 * @return the allocation decision paired with the chosen node (or {@code null} if none was chosen)
 */
private Tuple<Decision, ModelNode> allocateUnassignedShard(final ShardRouting shard, final Set<ModelNode> throttledNodes) {
    assert !shard.assignedToNode() : "not an unassigned shard: " + shard;
    if (allocation.deciders().canAllocate(shard, allocation).type() == Type.NO) {
        // the deciders reject this shard independently of any node, so there is nothing to search for
        return Tuple.tuple(Decision.NO, null);
    }

    // best candidate seen so far; bestNode and bestDecision are always assigned together
    float lowestWeight = Float.POSITIVE_INFINITY;
    ModelNode bestNode = null;
    Decision bestDecision = null;

    // only search when at least one node is not throttled
    if (throttledNodes.size() < nodes.size()) {
        // walk the model's node map rather than throttledNodes: per the original note, iterating an
        // identity hash set gives a different order on each run, which makes testing hard
        for (ModelNode candidate : nodes.values()) {
            if (throttledNodes.contains(candidate) || candidate.containsShard(shard)) {
                continue;
            }
            // weight the candidate would have if the shard were added to it
            final float candidateWeight = weight.weightShardAdded(this, candidate, shard.getIndexName());
            // the deciders are only consulted for candidates that are at least as good as the best so far;
            // written as a negated <= so that a NaN weight is skipped as well
            if (!(candidateWeight <= lowestWeight)) {
                continue;
            }
            final Decision candidateDecision = allocation.deciders().canAllocate(shard, candidate.getRoutingNode(), allocation);
            final Type candidateType = candidateDecision.type();
            if (candidateType != Type.YES && candidateType != Type.THROTTLE) {
                continue;
            }

            boolean replaceBest = true;
            if (candidateWeight == lowestWeight) {
                /* Equal weights, so break the tie:
                 * 1. if exactly one of the two decisions is YES, prefer it
                 * 2. otherwise prefer the node that holds the primary for this index with the next id in the
                 *    ring, i.e. for the 3 shards 2 replicas case we try to build up:
                 *      1 2 0
                 *      2 0 1
                 *      0 1 2
                 *    so that the preferred node is the one holding a shard with the minimal id greater than
                 *    the id of the shard being assigned. This works fine when new indices are created, since
                 *    primaries are added first and only one shard set is added at a time in this algorithm.
                 */
                if (candidateType == bestDecision.type()) {
                    final int shardId = shard.id();
                    final int candidateHigh = candidate.highestPrimary(shard.index().getName());
                    final int bestHigh = bestNode.highestPrimary(shard.getIndexName());
                    final boolean bothOnSameSide = (candidateHigh > shardId && bestHigh > shardId)
                        || (candidateHigh < shardId && bestHigh < shardId);
                    replaceBest = (bothOnSameSide && candidateHigh < bestHigh)
                        || (candidateHigh > bestHigh && candidateHigh > shardId && bestHigh < shardId);
                } else {
                    replaceBest = candidateType == Type.YES;
                }
            }

            if (replaceBest) {
                bestNode = candidate;
                lowestWeight = candidateWeight;
                bestDecision = candidateDecision;
            }
        }
    }

    // no candidate was ever selected, so report it as a NO decision with no node
    return Tuple.tuple(bestDecision == null ? Decision.NO : bestDecision, bestNode);
}
/**
* Tries to find a relocation from the max node to the minimal node for an arbitrary shard of the given index on the
* balance model. Iff this method returns a <code>true</code> the relocation has already been executed on the