Trim in-sync allocations set only when it grows (#21976)

This commit makes two changes to how the in-sync allocations set is updated:

- the set is only trimmed when it grows. This prevents trimming too eagerly when the number of replicas was decreased while shards were unassigned.
- the allocation id of an active primary that failed is only removed from the in-sync set if another replica gets promoted to primary. This prevents the situation where the only available shard copy in the cluster gets removed the in-sync set.

Closes #21719
This commit is contained in:
Yannick Welsch 2016-12-07 10:59:11 +01:00 committed by GitHub
parent c746854e03
commit 13e1a6fd40
2 changed files with 94 additions and 12 deletions

View File

@ -22,6 +22,7 @@ package org.elasticsearch.cluster.routing.allocation;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.RoutingChangesObserver;
import org.elasticsearch.cluster.routing.RoutingTable;
@ -174,10 +175,13 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting
// Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
// but repeatedly shut down nodes that have active replicas.
// We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
// Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
// of replicas was decreased while shards were unassigned.
int maxActiveShards = oldIndexMetaData.getNumberOfReplicas() + 1; // +1 for the primary
if (inSyncAllocationIds.size() > maxActiveShards) {
IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
// trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
List<ShardRouting> assignedShards = newRoutingTable.shardRoutingTable(shardId).assignedShards();
List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards();
assert assignedShards.size() <= maxActiveShards :
"cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
@ -187,16 +191,12 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting
.collect(Collectors.toSet());
}
// only update in-sync allocation ids if there is at least one entry remaining. Assume for example that there only
// ever was a primary active and now it failed. If we were to remove the allocation id from the in-sync set, this would
// create an empty primary on the next allocation (see ShardRouting#allocatedPostIndexCreate)
if (inSyncAllocationIds.isEmpty() && oldInSyncAllocationIds.isEmpty() == false) {
assert updates.firstFailedPrimary != null :
"in-sync set became empty but active primary wasn't failed: " + oldInSyncAllocationIds;
if (updates.firstFailedPrimary != null) {
// add back allocation id of failed primary
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
}
// only remove allocation id of failed active primary if there is at least one active shard remaining. Assume for example that
// the primary fails but there is no new primary to fail over to. If we were to remove the allocation id of the primary from the
// in-sync set, this could create an empty primary on the next allocation.
if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
// add back allocation id of failed primary
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
}
assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() :

View File

@ -259,6 +259,88 @@ public class InSyncAllocationIdTests extends ESAllocationTestCase {
assertThat(newInSyncSet, hasItem(primaryShard.allocationId().getId()));
}
/**
* Only trim set of allocation ids when the set grows
*/
public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception {
ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);
Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0);
assertThat(inSyncSet.size(), equalTo(2));
IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
ShardRouting primaryShard = shardRoutingTable.primaryShard();
ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
logger.info("remove replica node");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
.remove(replicaShard.currentNodeId()))
.build();
clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
// in-sync allocation ids should not be updated
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
logger.info("remove primary node");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
.remove(primaryShard.currentNodeId()))
.build();
clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
// in-sync allocation ids should not be updated
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
logger.info("decrease number of replicas to 0");
clusterState = ClusterState.builder(clusterState)
.routingTable(RoutingTable.builder(clusterState.routingTable()).updateNumberOfReplicas(0, "test").build())
.metaData(MetaData.builder(clusterState.metaData()).updateNumberOfReplicas(0, "test")).build();
logger.info("add back node 1");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(
newNode("node1"))).build();
clusterState = allocation.reroute(clusterState, "reroute");
assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1));
// in-sync allocation ids should not be updated
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
logger.info("start primary shard");
clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING));
// in-sync allocation ids should not be updated
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
}
/**
* Don't remove allocation id of failed active primary if there is no replica to promote as primary.
*/
public void testPrimaryAllocationIdNotRemovedFromInSyncSetWhenNoFailOver() throws Exception {
ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);
Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0);
assertThat(inSyncSet.size(), equalTo(2));
IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
ShardRouting primaryShard = shardRoutingTable.primaryShard();
ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
logger.info("remove replica node");
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
.remove(replicaShard.currentNodeId()))
.build();
clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
// in-sync allocation ids should not be updated
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
logger.info("fail primary shard");
clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(new ShardEntry(
shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null))).resultingState;
assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(0));
// in-sync allocation ids should not be updated
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
}
private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService allocation) {
logger.info("creating an index with 1 shard, 1 replica");
MetaData metaData = MetaData.builder()