From 13e1a6fd40da65890b68dda5b78e2e5ae9cd7355 Mon Sep 17 00:00:00 2001
From: Yannick Welsch <yannick@welsch.lu>
Date: Wed, 7 Dec 2016 10:59:11 +0100
Subject: [PATCH] Trim in-sync allocations set only when it grows (#21976)

This commit makes two changes to how the in-sync allocations set is updated:

- the set is only trimmed when it grows. This prevents trimming too eagerly when the number of replicas was decreased while shards were unassigned.
- the allocation id of an active primary that failed is only removed from the in-sync set if another replica gets promoted to primary. This prevents the situation where the only available shard copy in the cluster gets removed the in-sync set.

Closes #21719
---
 .../allocation/IndexMetaDataUpdater.java      | 24 +++---
 .../allocation/InSyncAllocationIdTests.java   | 82 +++++++++++++++++++
 2 files changed, 94 insertions(+), 12 deletions(-)
diff --git a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java
index fa30a102bf6..b24a961829d 100644
--- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java
@@ -22,6 +22,7 @@ package org.elasticsearch.cluster.routing.allocation;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.metadata.MetaData;
+import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
 import org.elasticsearch.cluster.routing.RecoverySource;
 import org.elasticsearch.cluster.routing.RoutingChangesObserver;
 import org.elasticsearch.cluster.routing.RoutingTable;
@@ -174,10 +175,13 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting
             // Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
             // but repeatedly shut down nodes that have active replicas.
             // We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
+            // Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
+            // of replicas was decreased while shards were unassigned.
             int maxActiveShards = oldIndexMetaData.getNumberOfReplicas() + 1; // +1 for the primary
-            if (inSyncAllocationIds.size() > maxActiveShards) {
+            IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
+            if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
                 // trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
-                List<ShardRouting> assignedShards = newRoutingTable.shardRoutingTable(shardId).assignedShards();
+                List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards();
                 assert assignedShards.size() <= maxActiveShards :
                     "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
                 Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
@@ -187,16 +191,12 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting
                     .collect(Collectors.toSet());
             }
 
-            // only update in-sync allocation ids if there is at least one entry remaining. Assume for example that there only
-            // ever was a primary active and now it failed. If we were to remove the allocation id from the in-sync set, this would
-            // create an empty primary on the next allocation (see ShardRouting#allocatedPostIndexCreate)
-            if (inSyncAllocationIds.isEmpty() && oldInSyncAllocationIds.isEmpty() == false) {
-                assert updates.firstFailedPrimary != null :
-                    "in-sync set became empty but active primary wasn't failed: " + oldInSyncAllocationIds;
-                if (updates.firstFailedPrimary != null) {
-                    // add back allocation id of failed primary
-                    inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
-                }
+            // only remove allocation id of failed active primary if there is at least one active shard remaining. Assume for example that
+            // the primary fails but there is no new primary to fail over to. If we were to remove the allocation id of the primary from the
+            // in-sync set, this could create an empty primary on the next allocation.
+            if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
+                // add back allocation id of failed primary
+                inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
             }
 
             assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() :
diff --git a/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java b/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java
index 6c1813de082..616eff4381c 100644
--- a/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java
+++ b/core/src/test/java/org/elasticsearch/cluster/routing/allocation/InSyncAllocationIdTests.java
@@ -259,6 +259,88 @@ public class InSyncAllocationIdTests extends ESAllocationTestCase {
         assertThat(newInSyncSet, hasItem(primaryShard.allocationId().getId()));
     }
 
+    /**
+     * Only trim set of allocation ids when the set grows
+     */
+    public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception {
+        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);
+
+        Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0);
+        assertThat(inSyncSet.size(), equalTo(2));
+
+        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
+        ShardRouting primaryShard = shardRoutingTable.primaryShard();
+        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
+
+        logger.info("remove replica node");
+        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
+            .remove(replicaShard.currentNodeId()))
+            .build();
+        clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
+
+        // in-sync allocation ids should not be updated
+        assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
+
+        logger.info("remove primary node");
+        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
+            .remove(primaryShard.currentNodeId()))
+            .build();
+        clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
+
+        // in-sync allocation ids should not be updated
+        assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
+
+        logger.info("decrease number of replicas to 0");
+        clusterState = ClusterState.builder(clusterState)
+            .routingTable(RoutingTable.builder(clusterState.routingTable()).updateNumberOfReplicas(0, "test").build())
+            .metaData(MetaData.builder(clusterState.metaData()).updateNumberOfReplicas(0, "test")).build();
+
+        logger.info("add back node 1");
+        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(
+            newNode("node1"))).build();
+        clusterState = allocation.reroute(clusterState, "reroute");
+
+        assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1));
+        // in-sync allocation ids should not be updated
+        assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
+
+        logger.info("start primary shard");
+        clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING));
+        // in-sync allocation ids should not be updated
+        assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
+    }
+
+    /**
+     * Don't remove allocation id of failed active primary if there is no replica to promote as primary.
+     */
+    public void testPrimaryAllocationIdNotRemovedFromInSyncSetWhenNoFailOver() throws Exception {
+        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);
+
+        Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0);
+        assertThat(inSyncSet.size(), equalTo(2));
+
+        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
+        ShardRouting primaryShard = shardRoutingTable.primaryShard();
+        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
+
+        logger.info("remove replica node");
+        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
+            .remove(replicaShard.currentNodeId()))
+            .build();
+        clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
+
+        // in-sync allocation ids should not be updated
+        assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
+
+        logger.info("fail primary shard");
+        clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(new ShardEntry(
+            shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null))).resultingState;
+
+        assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(0));
+        // in-sync allocation ids should not be updated
+        assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
+    }
+
     private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService allocation) {
         logger.info("creating an index with 1 shard, 1 replica");
         MetaData metaData = MetaData.builder()