Promote shadow replica to primary when initializing primary fails (#22021)

Failing an initializing primary on an index that uses shadow replicas could leave the primary unassigned while its replicas remained active. This commit fixes that by promoting an active replica to primary instead.
This commit is contained in:
Yannick Welsch 2016-12-07 13:59:43 +01:00 committed by GitHub
parent 13e1a6fd40
commit 9630b1a6e7
1 changed file with 30 additions and 11 deletions

View File

@ -537,8 +537,22 @@ public class RoutingNodes implements Iterable<RoutingNode> {
// fail actual shard
if (failedShard.initializing()) {
if (failedShard.relocatingNodeId() == null) {
// initializing shard that is not relocation target, just move to unassigned
moveToUnassigned(failedShard, unassignedInfo);
if (failedShard.primary()) {
// promote active replica to primary if active replica exists (only the case for shadow replicas)
ShardRouting activeReplica = activeReplica(failedShard.shardId());
assert activeReplica == null || IndexMetaData.isIndexUsingShadowReplicas(indexMetaData.getSettings()) :
"initializing primary [" + failedShard + "] with active replicas [" + activeReplica + "] only expected when " +
"using shadow replicas";
if (activeReplica == null) {
moveToUnassigned(failedShard, unassignedInfo);
} else {
movePrimaryToUnassignedAndDemoteToReplica(failedShard, unassignedInfo);
promoteReplicaToPrimary(activeReplica, indexMetaData, routingChangesObserver);
}
} else {
// initializing shard that is not relocation target, just move to unassigned
moveToUnassigned(failedShard, unassignedInfo);
}
} else {
// The shard is a target of a relocating shard. In that case we only need to remove the target shard and cancel the source
// relocation. No shard is left unassigned
@ -561,16 +575,8 @@ public class RoutingNodes implements Iterable<RoutingNode> {
if (activeReplica == null) {
moveToUnassigned(failedShard, unassignedInfo);
} else {
// if the activeReplica was relocating before this call to failShard, its relocation was cancelled above when we
// failed initializing replica shards (and moved replica relocation source back to started)
assert activeReplica.started() : "replica relocation should have been cancelled: " + activeReplica;
movePrimaryToUnassignedAndDemoteToReplica(failedShard, unassignedInfo);
ShardRouting primarySwappedCandidate = promoteActiveReplicaShardToPrimary(activeReplica);
routingChangesObserver.replicaPromoted(activeReplica);
if (IndexMetaData.isIndexUsingShadowReplicas(indexMetaData.getSettings())) {
ShardRouting initializedShard = reinitShadowPrimary(primarySwappedCandidate);
routingChangesObserver.startedPrimaryReinitialized(primarySwappedCandidate, initializedShard);
}
promoteReplicaToPrimary(activeReplica, indexMetaData, routingChangesObserver);
}
} else {
assert failedShard.primary() == false;
@ -586,6 +592,19 @@ public class RoutingNodes implements Iterable<RoutingNode> {
" was matched but wasn't removed";
}
/**
 * Promotes the given active replica to primary and notifies the observer of the promotion.
 * When the index uses shadow replicas, the newly promoted primary is additionally passed to
 * {@code reinitShadowPrimary} and the observer is told about the resulting shard.
 *
 * @param activeReplica          the replica to promote; asserted to be in the started state
 * @param indexMetaData          metadata of the index the shard belongs to; its settings decide
 *                               whether the shadow-replica handling applies
 * @param routingChangesObserver observer that is notified of the promotion and, for shadow
 *                               replica indices, of the primary re-initialization
 */
private void promoteReplicaToPrimary(ShardRouting activeReplica, IndexMetaData indexMetaData,
                                     RoutingChangesObserver routingChangesObserver) {
    // A replica that was relocating before failShard was invoked is expected to have had its
    // relocation cancelled earlier, when the initializing replica shards were failed (which moves
    // the replica relocation source back to started).
    assert activeReplica.started() : "replica relocation should have been cancelled: " + activeReplica;

    final ShardRouting promotedPrimary = promoteActiveReplicaShardToPrimary(activeReplica);
    routingChangesObserver.replicaPromoted(activeReplica);

    if (IndexMetaData.isIndexUsingShadowReplicas(indexMetaData.getSettings()) == false) {
        // regular index: promotion is all there is to do
        return;
    }

    // shadow replica index: the promoted primary is re-initialized and the observer informed
    final ShardRouting reinitializedPrimary = reinitShadowPrimary(promotedPrimary);
    routingChangesObserver.startedPrimaryReinitialized(promotedPrimary, reinitializedPrimary);
}
/**
* Mark a shard as started and adjusts internal statistics.
*