Primary shard failure with initializing replica shards can cause the replica shard to cause allocation failures
fixes #2592
This commit is contained in:
parent
a7bb3c29f2
commit
042a5d02d9
|
@ -20,7 +20,6 @@
|
||||||
package org.elasticsearch.cluster.routing.allocation;
|
package org.elasticsearch.cluster.routing.allocation;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import org.elasticsearch.ElasticSearchException;
|
import org.elasticsearch.ElasticSearchException;
|
||||||
import org.elasticsearch.ElasticSearchIllegalStateException;
|
import org.elasticsearch.ElasticSearchIllegalStateException;
|
||||||
import org.elasticsearch.cluster.ClusterState;
|
import org.elasticsearch.cluster.ClusterState;
|
||||||
|
@ -103,7 +102,7 @@ public class AllocationService extends AbstractComponent {
|
||||||
// shuffle the unassigned nodes, just so we won't have things like poison failed shards
|
// shuffle the unassigned nodes, just so we won't have things like poison failed shards
|
||||||
Collections.shuffle(routingNodes.unassigned());
|
Collections.shuffle(routingNodes.unassigned());
|
||||||
FailedRerouteAllocation allocation = new FailedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), failedShard);
|
FailedRerouteAllocation allocation = new FailedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), failedShard);
|
||||||
boolean changed = applyFailedShard(allocation, failedShard);
|
boolean changed = applyFailedShard(allocation, failedShard, true);
|
||||||
if (!changed) {
|
if (!changed) {
|
||||||
return new RoutingAllocation.Result(false, clusterState.routingTable(), allocation.explanation());
|
return new RoutingAllocation.Result(false, clusterState.routingTable(), allocation.explanation());
|
||||||
}
|
}
|
||||||
|
@ -165,7 +164,7 @@ public class AllocationService extends AbstractComponent {
|
||||||
|
|
||||||
// elect primaries *before* allocating unassigned, so backups of primaries that failed
|
// elect primaries *before* allocating unassigned, so backups of primaries that failed
|
||||||
// will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
|
// will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
|
||||||
changed |= electPrimaries(allocation.routingNodes());
|
changed |= electPrimariesAndUnassignDanglingReplicas(allocation);
|
||||||
|
|
||||||
if (!changed) {
|
if (!changed) {
|
||||||
return new RoutingAllocation.Result(false, clusterState.routingTable(), allocation.explanation());
|
return new RoutingAllocation.Result(false, clusterState.routingTable(), allocation.explanation());
|
||||||
|
@ -185,13 +184,13 @@ public class AllocationService extends AbstractComponent {
|
||||||
|
|
||||||
// elect primaries *before* allocating unassigned, so backups of primaries that failed
|
// elect primaries *before* allocating unassigned, so backups of primaries that failed
|
||||||
// will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
|
// will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
|
||||||
changed |= electPrimaries(allocation.routingNodes());
|
changed |= electPrimariesAndUnassignDanglingReplicas(allocation);
|
||||||
|
|
||||||
// now allocate all the unassigned to available nodes
|
// now allocate all the unassigned to available nodes
|
||||||
if (allocation.routingNodes().hasUnassigned()) {
|
if (allocation.routingNodes().hasUnassigned()) {
|
||||||
changed |= shardsAllocators.allocateUnassigned(allocation);
|
changed |= shardsAllocators.allocateUnassigned(allocation);
|
||||||
// elect primaries again, in case this is needed with unassigned allocation
|
// elect primaries again, in case this is needed with unassigned allocation
|
||||||
changed |= electPrimaries(allocation.routingNodes());
|
changed |= electPrimariesAndUnassignDanglingReplicas(allocation);
|
||||||
}
|
}
|
||||||
|
|
||||||
// move shards that no longer can be allocated
|
// move shards that no longer can be allocated
|
||||||
|
@ -242,8 +241,9 @@ public class AllocationService extends AbstractComponent {
|
||||||
return changed;
|
return changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean electPrimaries(RoutingNodes routingNodes) {
|
private boolean electPrimariesAndUnassignDanglingReplicas(RoutingAllocation allocation) {
|
||||||
boolean changed = false;
|
boolean changed = false;
|
||||||
|
RoutingNodes routingNodes = allocation.routingNodes();
|
||||||
for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
|
for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
|
||||||
if (shardEntry.primary() && !shardEntry.assignedToNode()) {
|
if (shardEntry.primary() && !shardEntry.assignedToNode()) {
|
||||||
boolean elected = false;
|
boolean elected = false;
|
||||||
|
@ -283,6 +283,29 @@ public class AllocationService extends AbstractComponent {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// go over and remove dangling replicas that are initializing, but we couldn't elect primary ones...
|
||||||
|
List<ShardRouting> shardsToFail = null;
|
||||||
|
for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
|
||||||
|
if (shardEntry.primary() && !shardEntry.assignedToNode()) {
|
||||||
|
for (RoutingNode routingNode : routingNodes.nodesToShards().values()) {
|
||||||
|
for (MutableShardRouting shardEntry2 : routingNode.shards()) {
|
||||||
|
if (shardEntry.shardId().equals(shardEntry2.shardId()) && !shardEntry2.active()) {
|
||||||
|
changed = true;
|
||||||
|
if (shardsToFail == null) {
|
||||||
|
shardsToFail = new ArrayList<ShardRouting>();
|
||||||
|
}
|
||||||
|
shardsToFail.add(shardEntry2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (shardsToFail != null) {
|
||||||
|
for (ShardRouting shardToFail : shardsToFail) {
|
||||||
|
applyFailedShard(allocation, shardToFail, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
return changed;
|
return changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -310,8 +333,7 @@ public class AllocationService extends AbstractComponent {
|
||||||
changed = true;
|
changed = true;
|
||||||
// now, go over all the shards routing on the node, and fail them
|
// now, go over all the shards routing on the node, and fail them
|
||||||
for (MutableShardRouting shardRouting : new ArrayList<MutableShardRouting>(node.shards())) {
|
for (MutableShardRouting shardRouting : new ArrayList<MutableShardRouting>(node.shards())) {
|
||||||
// we create a copy of the shard routing, since applyFailedShard assumes its a new copy
|
applyFailedShard(allocation, shardRouting, false);
|
||||||
applyFailedShard(allocation, shardRouting);
|
|
||||||
}
|
}
|
||||||
// its a dead node, remove it, note, its important to remove it *after* we apply failed shard
|
// its a dead node, remove it, note, its important to remove it *after* we apply failed shard
|
||||||
// since it relies on the fact that the RoutingNode exists in the list of nodes
|
// since it relies on the fact that the RoutingNode exists in the list of nodes
|
||||||
|
@ -372,7 +394,7 @@ public class AllocationService extends AbstractComponent {
|
||||||
* Applies the relevant logic to handle a failed shard. Returns <tt>true</tt> if changes happened that
|
* Applies the relevant logic to handle a failed shard. Returns <tt>true</tt> if changes happened that
|
||||||
* require relocation.
|
* require relocation.
|
||||||
*/
|
*/
|
||||||
private boolean applyFailedShard(RoutingAllocation allocation, ShardRouting failedShard) {
|
private boolean applyFailedShard(RoutingAllocation allocation, ShardRouting failedShard, boolean addToIgnoreList) {
|
||||||
// create a copy of the failed shard, since we assume we can change possible refernces to it without
|
// create a copy of the failed shard, since we assume we can change possible refernces to it without
|
||||||
// changing the state of failed shard
|
// changing the state of failed shard
|
||||||
failedShard = new ImmutableShardRouting(failedShard);
|
failedShard = new ImmutableShardRouting(failedShard);
|
||||||
|
@ -397,8 +419,10 @@ public class AllocationService extends AbstractComponent {
|
||||||
it.remove();
|
it.remove();
|
||||||
shardRouting.deassignNode();
|
shardRouting.deassignNode();
|
||||||
|
|
||||||
|
if (addToIgnoreList) {
|
||||||
// make sure we ignore this shard on the relevant node
|
// make sure we ignore this shard on the relevant node
|
||||||
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
|
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
|
||||||
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -433,8 +457,10 @@ public class AllocationService extends AbstractComponent {
|
||||||
dirty = true;
|
dirty = true;
|
||||||
shardRouting.cancelRelocation();
|
shardRouting.cancelRelocation();
|
||||||
it.remove();
|
it.remove();
|
||||||
|
if (addToIgnoreList) {
|
||||||
// make sure we ignore this shard on the relevant node
|
// make sure we ignore this shard on the relevant node
|
||||||
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
|
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
|
||||||
|
}
|
||||||
|
|
||||||
allocation.routingNodes().unassigned().add(new MutableShardRouting(failedShard.index(), failedShard.id(),
|
allocation.routingNodes().unassigned().add(new MutableShardRouting(failedShard.index(), failedShard.id(),
|
||||||
null, failedShard.primary(), ShardRoutingState.UNASSIGNED, failedShard.version() + 1));
|
null, failedShard.primary(), ShardRoutingState.UNASSIGNED, failedShard.version() + 1));
|
||||||
|
@ -469,8 +495,10 @@ public class AllocationService extends AbstractComponent {
|
||||||
MutableShardRouting shardRouting = it.next();
|
MutableShardRouting shardRouting = it.next();
|
||||||
if (shardRouting.equals(failedShard)) {
|
if (shardRouting.equals(failedShard)) {
|
||||||
dirty = true;
|
dirty = true;
|
||||||
|
if (addToIgnoreList) {
|
||||||
// make sure we ignore this shard on the relevant node
|
// make sure we ignore this shard on the relevant node
|
||||||
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
|
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
|
||||||
|
}
|
||||||
|
|
||||||
it.remove();
|
it.remove();
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.elasticsearch.cluster.metadata.MetaData;
|
||||||
import org.elasticsearch.cluster.routing.RoutingNodes;
|
import org.elasticsearch.cluster.routing.RoutingNodes;
|
||||||
import org.elasticsearch.cluster.routing.RoutingTable;
|
import org.elasticsearch.cluster.routing.RoutingTable;
|
||||||
import org.elasticsearch.cluster.routing.allocation.AllocationService;
|
import org.elasticsearch.cluster.routing.allocation.AllocationService;
|
||||||
|
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
|
||||||
import org.elasticsearch.common.logging.ESLogger;
|
import org.elasticsearch.common.logging.ESLogger;
|
||||||
import org.elasticsearch.common.logging.Loggers;
|
import org.elasticsearch.common.logging.Loggers;
|
||||||
import org.testng.annotations.Test;
|
import org.testng.annotations.Test;
|
||||||
|
@ -98,4 +99,51 @@ public class PrimaryElectionRoutingTests {
|
||||||
assertThat(routingTable.index("test").shard(0).primaryShard().currentNodeId(), equalTo("node2"));
|
assertThat(routingTable.index("test").shard(0).primaryShard().currentNodeId(), equalTo("node2"));
|
||||||
assertThat(routingTable.index("test").shard(0).replicaShards().get(0).currentNodeId(), equalTo("node3"));
|
assertThat(routingTable.index("test").shard(0).replicaShards().get(0).currentNodeId(), equalTo("node3"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRemovingInitializingReplicasIfPrimariesFails() {
|
||||||
|
AllocationService allocation = new AllocationService(settingsBuilder().put("cluster.routing.allocation.concurrent_recoveries", 10).build());
|
||||||
|
|
||||||
|
logger.info("Building initial routing table");
|
||||||
|
|
||||||
|
MetaData metaData = newMetaDataBuilder()
|
||||||
|
.put(newIndexMetaDataBuilder("test").numberOfShards(2).numberOfReplicas(1))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
RoutingTable routingTable = routingTable()
|
||||||
|
.addAsNew(metaData.index("test"))
|
||||||
|
.build();
|
||||||
|
|
||||||
|
ClusterState clusterState = newClusterStateBuilder().metaData(metaData).routingTable(routingTable).build();
|
||||||
|
|
||||||
|
logger.info("Adding two nodes and performing rerouting");
|
||||||
|
clusterState = newClusterStateBuilder().state(clusterState).nodes(newNodesBuilder().put(newNode("node1")).put(newNode("node2"))).build();
|
||||||
|
RoutingAllocation.Result rerouteResult = allocation.reroute(clusterState);
|
||||||
|
clusterState = newClusterStateBuilder().state(clusterState).routingTable(rerouteResult.routingTable()).build();
|
||||||
|
|
||||||
|
logger.info("Start the primary shards");
|
||||||
|
RoutingNodes routingNodes = clusterState.routingNodes();
|
||||||
|
rerouteResult = allocation.applyStartedShards(clusterState, routingNodes.shardsWithState(INITIALIZING));
|
||||||
|
clusterState = newClusterStateBuilder().state(clusterState).routingTable(rerouteResult.routingTable()).build();
|
||||||
|
routingNodes = clusterState.routingNodes();
|
||||||
|
|
||||||
|
assertThat(routingNodes.shardsWithState(STARTED).size(), equalTo(2));
|
||||||
|
assertThat(routingNodes.shardsWithState(INITIALIZING).size(), equalTo(2));
|
||||||
|
|
||||||
|
// now, fail one node, while the replica is initializing, and it also holds a primary
|
||||||
|
logger.info("--> fail node with primary");
|
||||||
|
String nodeIdToFail = clusterState.routingTable().index("test").shard(0).primaryShard().currentNodeId();
|
||||||
|
String nodeIdRemaining = nodeIdToFail.equals("node1") ? "node2" : "node1";
|
||||||
|
clusterState = newClusterStateBuilder().state(clusterState).nodes(newNodesBuilder()
|
||||||
|
.put(newNode(nodeIdRemaining))
|
||||||
|
).build();
|
||||||
|
rerouteResult = allocation.reroute(clusterState);
|
||||||
|
clusterState = newClusterStateBuilder().state(clusterState).routingTable(rerouteResult.routingTable()).build();
|
||||||
|
routingNodes = clusterState.routingNodes();
|
||||||
|
|
||||||
|
assertThat(routingNodes.shardsWithState(STARTED).size(), equalTo(1));
|
||||||
|
assertThat(routingNodes.shardsWithState(INITIALIZING).size(), equalTo(1));
|
||||||
|
assertThat(routingNodes.node(nodeIdRemaining).shardsWithState(INITIALIZING).get(0).primary(), equalTo(true));
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue