Prevent ShardAllocator to modify the unassigned while running allocations

The unassinged list is used to make allocation decisions but is currently
modified during allocation runs which causes primaries to be throttled
during allocation. If this happens newly allocated indices can be stalled
for a long time turning a cluster into a RED state if concurrent relocations
and / or recoveries are happening.

Closes #3610
This commit is contained in:
Simon Willnauer 2013-09-03 14:38:04 +02:00
parent ca6c26c62d
commit b39961b2a6
3 changed files with 46 additions and 20 deletions

View File

@ -334,14 +334,13 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
return new NodeSorter(nodesArray(), weight, this);
}
private boolean initialize(RoutingNodes routing) {
private boolean initialize(RoutingNodes routing, List<MutableShardRouting> unassigned) {
if (logger.isTraceEnabled()) {
logger.trace("Start distributing Shards");
}
indices.addAll(allocation.routingTable().indicesRouting().keySet());
buildModelFromAssigned(routing.shards(assignedFilter));
return allocateUnassigned(allocation.routingNodes().unassigned(), allocation.routingNodes().ignoredUnassigned());
return allocateUnassigned(unassigned, routing.ignoredUnassigned());
}
/**
@ -366,8 +365,8 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
if (logger.isTraceEnabled()) {
logger.trace("Start balancing cluster");
}
boolean changed = initialize(allocation.routingNodes());
final TransactionalList<MutableShardRouting> unassigned = new TransactionalList<MutableShardRouting>(allocation.routingNodes().unassigned());
boolean changed = initialize(allocation.routingNodes(), unassigned);
NodeSorter sorter = newNodeSorter();
if (nodes.size() > 1) { /* skip if we only have one node */
for (String index : buildWeightOrderedIndidces(Operation.BALANCE, sorter)) {
@ -445,6 +444,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
}
}
}
unassigned.commit();
return changed;
}
@ -519,7 +519,8 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
if (logger.isTraceEnabled()) {
logger.trace("Try moving shard [{}] from [{}]", shard, node);
}
boolean changed = initialize(allocation.routingNodes());
final TransactionalList<MutableShardRouting> unassigned = new TransactionalList<MutableShardRouting>(allocation.routingNodes().unassigned());
boolean changed = initialize(allocation.routingNodes(), unassigned);
final ModelNode sourceNode = nodes.get(node.nodeId());
assert sourceNode != null;
@ -533,6 +534,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
* This is not guaranteed to be balanced after this operation we still try best effort to
* allocate on the minimal eligible node.
*/
for (ModelNode currentNode : nodes) {
if (currentNode.getNodeId().equals(node.nodeId())) {
continue;
@ -549,10 +551,11 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
if (logger.isTraceEnabled()) {
logger.trace("Moved shard [{}] to node [{}]", shard, currentNode.getNodeId());
}
return true;
changed = true;
break;
}
}
unassigned.commit();
return changed;
}
@ -1039,4 +1042,37 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
return weights[weights.length - 1] - weights[0];
}
}
/**
* A list that makes a full copy of the original list and applies all
* modification to the copied list once {@link TransactionalList#commit()}
* is called.
*
*/
@SuppressWarnings("serial")
private static final class TransactionalList<T> extends ArrayList<T> {
private final List<T> originalList;
private List<T> assertingList; // only with assert
TransactionalList(List<T> originalList) {
super(originalList);
assert copyAsseringList(originalList);
this.originalList = originalList;
}
private boolean copyAsseringList(List<T> orig) {
this.assertingList = new ArrayList<T>(orig);
return true;
}
public void commit() {
/* Ensure that the actual source list is not modified while
* the transaction is running */
assert assertingList.equals(originalList) : "The list was modified outside of the scope";
originalList.clear();
originalList.addAll(this);
}
}
}

View File

@ -74,16 +74,8 @@ public class ThrottlingAllocationDecider extends AllocationDecider {
@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
if (shardRouting.primary()) {
boolean primaryUnassigned = false;
List<MutableShardRouting> unassigned = allocation.routingNodes().unassigned();
for (int i1 = 0; i1 < unassigned.size(); i1++) {
MutableShardRouting shard = unassigned.get(i1);
if (shard.shardId().equals(shardRouting.shardId())) {
primaryUnassigned = true;
break;
}
}
if (primaryUnassigned) {
assert shardRouting.unassigned() || shardRouting.active();
if (shardRouting.unassigned()) {
// primary is unassigned, means we are going to do recovery from gateway
// count *just the primary* currently doing recovery on the node and check against concurrent_recoveries
int primariesInRecovery = 0;

View File

@ -19,7 +19,6 @@
package org.elasticsearch.test.unit.cluster.routing.allocation;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.RoutingTable;
@ -45,7 +44,6 @@ public class PreferPrimaryAllocationTests extends ElasticsearchTestCase {
private final ESLogger logger = Loggers.getLogger(PreferPrimaryAllocationTests.class);
@LuceneTestCase.AwaitsFix(bugUrl = "seems like unassigned is cleared so throttling for primaries is not properly working")
@Test
public void testPreferPrimaryAllocationOverReplicas() {
logger.info("create an allocation with 1 initial recoveries");