Add node id to shard failure message (#28024)

This makes it possible to see, via the allocation explain API, which node a shard was last allocated to before it failed.

Closes #28018
This commit is contained in:
Yannick Welsch 2017-12-29 17:40:28 +01:00 committed by GitHub
parent 100a7b1f01
commit 2603391c00
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 12 deletions

View File

@ -181,7 +181,8 @@ public class AllocationService extends AbstractComponent {
shardToFail.shardId(), shardToFail, failedShard); shardToFail.shardId(), shardToFail, failedShard);
} }
int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0; int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShardEntry.getMessage(), String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message,
failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false, failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
AllocationStatus.NO_ATTEMPT); AllocationStatus.NO_ATTEMPT);
routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetaData, allocation.changes()); routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetaData, allocation.changes());

View File

@ -260,8 +260,8 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(1)); assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(1));
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo(), notNullValue()); assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo(), notNullValue());
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED)); assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getMessage(), equalTo("test fail")); assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getMessage(), equalTo("failed shard on node [" + shardToFail.currentNodeId() + "]: test fail"));
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getDetails(), equalTo("test fail")); assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getDetails(), equalTo("failed shard on node [" + shardToFail.currentNodeId() + "]: test fail"));
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getUnassignedTimeInMillis(), greaterThan(0L)); assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getUnassignedTimeInMillis(), greaterThan(0L));
} }

View File

@ -43,6 +43,7 @@ import java.util.List;
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.not;
@ -98,7 +99,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
assertEquals(routingTable.index("idx").shards().size(), 1); assertEquals(routingTable.index("idx").shards().size(), 1);
assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), INITIALIZING); assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), INITIALIZING);
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), i+1); assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), i+1);
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom" + i); assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom" + i));
} }
// now we go and check that we actually stick to unassigned on the next failure // now we go and check that we actually stick to unassigned on the next failure
List<FailedShard> failedShards = Collections.singletonList( List<FailedShard> failedShards = Collections.singletonList(
@ -111,7 +112,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
assertEquals(routingTable.index("idx").shards().size(), 1); assertEquals(routingTable.index("idx").shards().size(), 1);
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), retries); assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), retries);
assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), UNASSIGNED); assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), UNASSIGNED);
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom"); assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
// manual resetting of retry count // manual resetting of retry count
newState = strategy.reroute(clusterState, new AllocationCommands(), false, true).getClusterState(); newState = strategy.reroute(clusterState, new AllocationCommands(), false, true).getClusterState();
@ -123,7 +124,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
assertEquals(routingTable.index("idx").shards().size(), 1); assertEquals(routingTable.index("idx").shards().size(), 1);
assertEquals(0, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations()); assertEquals(0, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state()); assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom"); assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
// again fail it N-1 times // again fail it N-1 times
for (int i = 0; i < retries-1; i++) { for (int i = 0; i < retries-1; i++) {
@ -138,7 +139,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
assertEquals(routingTable.index("idx").shards().size(), 1); assertEquals(routingTable.index("idx").shards().size(), 1);
assertEquals(i + 1, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations()); assertEquals(i + 1, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state()); assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom"); assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
} }
// now we go and check that we actually stick to unassigned on the next failure // now we go and check that we actually stick to unassigned on the next failure
@ -152,7 +153,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
assertEquals(routingTable.index("idx").shards().size(), 1); assertEquals(routingTable.index("idx").shards().size(), 1);
assertEquals(retries, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations()); assertEquals(retries, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state()); assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
assertEquals("boom", routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage()); assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
} }
public void testFailedAllocation() { public void testFailedAllocation() {
@ -172,7 +173,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0); ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
assertEquals(unassignedPrimary.state(), INITIALIZING); assertEquals(unassignedPrimary.state(), INITIALIZING);
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), i+1); assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), i+1);
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom" + i); assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom" + i));
// MaxRetryAllocationDecider#canForceAllocatePrimary should return YES decisions because canAllocate returns YES here // MaxRetryAllocationDecider#canForceAllocatePrimary should return YES decisions because canAllocate returns YES here
assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary( assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0))); unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));
@ -190,7 +191,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0); ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries); assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries);
assertEquals(unassignedPrimary.state(), UNASSIGNED); assertEquals(unassignedPrimary.state(), UNASSIGNED);
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom"); assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom"));
// MaxRetryAllocationDecider#canForceAllocatePrimary should return a NO decision because canAllocate returns NO here // MaxRetryAllocationDecider#canForceAllocatePrimary should return a NO decision because canAllocate returns NO here
assertEquals(Decision.NO, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary( assertEquals(Decision.NO, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0))); unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));
@ -212,7 +213,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0); ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries); assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries);
assertEquals(unassignedPrimary.state(), INITIALIZING); assertEquals(unassignedPrimary.state(), INITIALIZING);
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom"); assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom"));
// bumped up the max retry count, so canForceAllocatePrimary should return a YES decision // bumped up the max retry count, so canForceAllocatePrimary should return a YES decision
assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary( assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
routingTable.index("idx").shard(0).shards().get(0), null, new RoutingAllocation(null, null, clusterState, null, 0))); routingTable.index("idx").shard(0).shards().get(0), null, new RoutingAllocation(null, null, clusterState, null, 0)));
@ -239,7 +240,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0); unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), 1); assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), 1);
assertEquals(unassignedPrimary.state(), UNASSIGNED); assertEquals(unassignedPrimary.state(), UNASSIGNED);
assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "ZOOOMG"); assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("ZOOOMG"));
// Counter reset, so MaxRetryAllocationDecider#canForceAllocatePrimary should return a YES decision // Counter reset, so MaxRetryAllocationDecider#canForceAllocatePrimary should return a YES decision
assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary( assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0))); unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));