From 13e1a6fd40da65890b68dda5b78e2e5ae9cd7355 Mon Sep 17 00:00:00 2001 From: Yannick Welsch Date: Wed, 7 Dec 2016 10:59:11 +0100 Subject: [PATCH] Trim in-sync allocations set only when it grows (#21976) This commit makes two changes to how the in-sync allocations set is updated: - the set is only trimmed when it grows. This prevents trimming too eagerly when the number of replicas was decreased while shards were unassigned. - the allocation id of an active primary that failed is only removed from the in-sync set if another replica gets promoted to primary. This prevents the situation where the only available shard copy in the cluster gets removed the in-sync set. Closes #21719 --- .../allocation/IndexMetaDataUpdater.java | 24 +++--- .../allocation/InSyncAllocationIdTests.java | 82 +++++++++++++++++++ 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java index fa30a102bf6..b24a961829d 100644 --- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java +++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java @@ -22,6 +22,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.MetaData; +import org.elasticsearch.cluster.routing.IndexShardRoutingTable; import org.elasticsearch.cluster.routing.RecoverySource; import org.elasticsearch.cluster.routing.RoutingChangesObserver; import org.elasticsearch.cluster.routing.RoutingTable; @@ -174,10 +175,13 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting // Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary // but repeatedly shut down nodes that have active replicas. // We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set + // Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number + // of replicas was decreased while shards were unassigned. int maxActiveShards = oldIndexMetaData.getNumberOfReplicas() + 1; // +1 for the primary - if (inSyncAllocationIds.size() > maxActiveShards) { + IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId); + if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) { // trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies) - List assignedShards = newRoutingTable.shardRoutingTable(shardId).assignedShards(); + List assignedShards = newShardRoutingTable.assignedShards(); assert assignedShards.size() <= maxActiveShards : "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards; Set assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet()); @@ -187,16 +191,12 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting .collect(Collectors.toSet()); } - // only update in-sync allocation ids if there is at least one entry remaining. Assume for example that there only - // ever was a primary active and now it failed. If we were to remove the allocation id from the in-sync set, this would - // create an empty primary on the next allocation (see ShardRouting#allocatedPostIndexCreate) - if (inSyncAllocationIds.isEmpty() && oldInSyncAllocationIds.isEmpty() == false) { - assert updates.firstFailedPrimary != null : - "in-sync set became empty but active primary wasn't failed: " + oldInSyncAllocationIds; - if (updates.firstFailedPrimary != null) { - // add back allocation id of failed primary - inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId()); - } + // only remove allocation id of failed active primary if there is at least one active shard remaining. Assume for example that + // the primary fails but there is no new primary to fail over to. If we were to remove the allocation id of the primary from the + // in-sync set, this could create an empty primary on the next allocation. + if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) { + // add back allocation id of failed primary + inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId()); } assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() : diff --git a/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java b/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java index 6c1813de082..616eff4381c 100644 --- a/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java +++ b/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java @@ -259,6 +259,88 @@ public class InSyncAllocationIdTests extends ESAllocationTestCase { assertThat(newInSyncSet, hasItem(primaryShard.allocationId().getId())); } + /** + * Only trim set of allocation ids when the set grows + */ + public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception { + ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); + + Set inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0); + assertThat(inSyncSet.size(), equalTo(2)); + + IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); + ShardRouting primaryShard = shardRoutingTable.primaryShard(); + ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); + + logger.info("remove replica node"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .remove(replicaShard.currentNodeId())) + .build(); + clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); + + // in-sync allocation ids should not be updated + assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); + + logger.info("remove primary node"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .remove(primaryShard.currentNodeId())) + .build(); + clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); + + // in-sync allocation ids should not be updated + assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); + + logger.info("decrease number of replicas to 0"); + clusterState = ClusterState.builder(clusterState) + .routingTable(RoutingTable.builder(clusterState.routingTable()).updateNumberOfReplicas(0, "test").build()) + .metaData(MetaData.builder(clusterState.metaData()).updateNumberOfReplicas(0, "test")).build(); + + logger.info("add back node 1"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add( + newNode("node1"))).build(); + clusterState = allocation.reroute(clusterState, "reroute"); + + assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1)); + // in-sync allocation ids should not be updated + assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); + + logger.info("start primary shard"); + clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); + // in-sync allocation ids should not be updated + assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); + } + + /** + * Don't remove allocation id of failed active primary if there is no replica to promote as primary. + */ + public void testPrimaryAllocationIdNotRemovedFromInSyncSetWhenNoFailOver() throws Exception { + ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); + + Set inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0); + assertThat(inSyncSet.size(), equalTo(2)); + + IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); + ShardRouting primaryShard = shardRoutingTable.primaryShard(); + ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); + + logger.info("remove replica node"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .remove(replicaShard.currentNodeId())) + .build(); + clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); + + // in-sync allocation ids should not be updated + assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); + + logger.info("fail primary shard"); + clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(new ShardEntry( + shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null))).resultingState; + + assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(0)); + // in-sync allocation ids should not be updated + assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); + } + private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService allocation) { logger.info("creating an index with 1 shard, 1 replica"); MetaData metaData = MetaData.builder()