Trim in-sync allocations set only when it grows (#21976)
This commit makes two changes to how the in-sync allocations set is updated: - the set is only trimmed when it grows. This prevents trimming too eagerly when the number of replicas was decreased while shards were unassigned. - the allocation id of an active primary that failed is only removed from the in-sync set if another replica gets promoted to primary. This prevents the situation where the only available shard copy in the cluster gets removed the in-sync set. Closes #21719
This commit is contained in:
parent
c746854e03
commit
13e1a6fd40
|
@ -22,6 +22,7 @@ package org.elasticsearch.cluster.routing.allocation;
|
|||
import org.elasticsearch.cluster.ClusterState;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.cluster.metadata.MetaData;
|
||||
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
|
||||
import org.elasticsearch.cluster.routing.RecoverySource;
|
||||
import org.elasticsearch.cluster.routing.RoutingChangesObserver;
|
||||
import org.elasticsearch.cluster.routing.RoutingTable;
|
||||
|
@ -174,10 +175,13 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting
|
|||
// Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
|
||||
// but repeatedly shut down nodes that have active replicas.
|
||||
// We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
|
||||
// Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
|
||||
// of replicas was decreased while shards were unassigned.
|
||||
int maxActiveShards = oldIndexMetaData.getNumberOfReplicas() + 1; // +1 for the primary
|
||||
if (inSyncAllocationIds.size() > maxActiveShards) {
|
||||
IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
|
||||
if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
|
||||
// trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
|
||||
List<ShardRouting> assignedShards = newRoutingTable.shardRoutingTable(shardId).assignedShards();
|
||||
List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards();
|
||||
assert assignedShards.size() <= maxActiveShards :
|
||||
"cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
|
||||
Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
|
||||
|
@ -187,16 +191,12 @@ public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRouting
|
|||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
// only update in-sync allocation ids if there is at least one entry remaining. Assume for example that there only
|
||||
// ever was a primary active and now it failed. If we were to remove the allocation id from the in-sync set, this would
|
||||
// create an empty primary on the next allocation (see ShardRouting#allocatedPostIndexCreate)
|
||||
if (inSyncAllocationIds.isEmpty() && oldInSyncAllocationIds.isEmpty() == false) {
|
||||
assert updates.firstFailedPrimary != null :
|
||||
"in-sync set became empty but active primary wasn't failed: " + oldInSyncAllocationIds;
|
||||
if (updates.firstFailedPrimary != null) {
|
||||
// add back allocation id of failed primary
|
||||
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
|
||||
}
|
||||
// only remove allocation id of failed active primary if there is at least one active shard remaining. Assume for example that
|
||||
// the primary fails but there is no new primary to fail over to. If we were to remove the allocation id of the primary from the
|
||||
// in-sync set, this could create an empty primary on the next allocation.
|
||||
if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
|
||||
// add back allocation id of failed primary
|
||||
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
|
||||
}
|
||||
|
||||
assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() :
|
||||
|
|
|
@ -259,6 +259,88 @@ public class InSyncAllocationIdTests extends ESAllocationTestCase {
|
|||
assertThat(newInSyncSet, hasItem(primaryShard.allocationId().getId()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Only trim set of allocation ids when the set grows
|
||||
*/
|
||||
public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception {
|
||||
ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);
|
||||
|
||||
Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0);
|
||||
assertThat(inSyncSet.size(), equalTo(2));
|
||||
|
||||
IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
|
||||
ShardRouting primaryShard = shardRoutingTable.primaryShard();
|
||||
ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
|
||||
|
||||
logger.info("remove replica node");
|
||||
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
|
||||
.remove(replicaShard.currentNodeId()))
|
||||
.build();
|
||||
clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
|
||||
|
||||
// in-sync allocation ids should not be updated
|
||||
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
|
||||
|
||||
logger.info("remove primary node");
|
||||
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
|
||||
.remove(primaryShard.currentNodeId()))
|
||||
.build();
|
||||
clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
|
||||
|
||||
// in-sync allocation ids should not be updated
|
||||
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
|
||||
|
||||
logger.info("decrease number of replicas to 0");
|
||||
clusterState = ClusterState.builder(clusterState)
|
||||
.routingTable(RoutingTable.builder(clusterState.routingTable()).updateNumberOfReplicas(0, "test").build())
|
||||
.metaData(MetaData.builder(clusterState.metaData()).updateNumberOfReplicas(0, "test")).build();
|
||||
|
||||
logger.info("add back node 1");
|
||||
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(
|
||||
newNode("node1"))).build();
|
||||
clusterState = allocation.reroute(clusterState, "reroute");
|
||||
|
||||
assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1));
|
||||
// in-sync allocation ids should not be updated
|
||||
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
|
||||
|
||||
logger.info("start primary shard");
|
||||
clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING));
|
||||
// in-sync allocation ids should not be updated
|
||||
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Don't remove allocation id of failed active primary if there is no replica to promote as primary.
|
||||
*/
|
||||
public void testPrimaryAllocationIdNotRemovedFromInSyncSetWhenNoFailOver() throws Exception {
|
||||
ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);
|
||||
|
||||
Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0);
|
||||
assertThat(inSyncSet.size(), equalTo(2));
|
||||
|
||||
IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
|
||||
ShardRouting primaryShard = shardRoutingTable.primaryShard();
|
||||
ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
|
||||
|
||||
logger.info("remove replica node");
|
||||
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
|
||||
.remove(replicaShard.currentNodeId()))
|
||||
.build();
|
||||
clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute");
|
||||
|
||||
// in-sync allocation ids should not be updated
|
||||
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
|
||||
|
||||
logger.info("fail primary shard");
|
||||
clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(new ShardEntry(
|
||||
shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null))).resultingState;
|
||||
|
||||
assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(0));
|
||||
// in-sync allocation ids should not be updated
|
||||
assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0));
|
||||
}
|
||||
|
||||
private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService allocation) {
|
||||
logger.info("creating an index with 1 shard, 1 replica");
|
||||
MetaData metaData = MetaData.builder()
|
||||
|
|
Loading…
Reference in New Issue