Merge pull request #15016 from jasontedor/shard-failure-batch
Use general cluster state batching mechanism for shard failures
This commit is contained in:
commit
182c22f23f
|
@ -20,9 +20,7 @@
|
|||
package org.elasticsearch.cluster.action.shard;
|
||||
|
||||
import org.elasticsearch.ExceptionsHelper;
|
||||
import org.elasticsearch.cluster.ClusterService;
|
||||
import org.elasticsearch.cluster.ClusterState;
|
||||
import org.elasticsearch.cluster.ClusterStateUpdateTask;
|
||||
import org.elasticsearch.cluster.*;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||
import org.elasticsearch.cluster.routing.RoutingService;
|
||||
|
@ -46,7 +44,6 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
|
||||
import static org.elasticsearch.cluster.routing.ShardRouting.readShardRoutingEntry;
|
||||
|
||||
|
@ -64,7 +61,6 @@ public class ShardStateAction extends AbstractComponent {
|
|||
private final RoutingService routingService;
|
||||
|
||||
private final BlockingQueue<ShardRoutingEntry> startedShardsQueue = ConcurrentCollections.newBlockingQueue();
|
||||
private final BlockingQueue<ShardRoutingEntry> failedShardQueue = ConcurrentCollections.newBlockingQueue();
|
||||
|
||||
@Inject
|
||||
public ShardStateAction(Settings settings, ClusterService clusterService, TransportService transportService,
|
||||
|
@ -141,54 +137,52 @@ public class ShardStateAction extends AbstractComponent {
|
|||
});
|
||||
}
|
||||
|
||||
private final ShardFailedClusterStateHandler shardFailedClusterStateHandler = new ShardFailedClusterStateHandler();
|
||||
|
||||
private void handleShardFailureOnMaster(final ShardRoutingEntry shardRoutingEntry) {
|
||||
logger.warn("{} received shard failed for {}", shardRoutingEntry.failure, shardRoutingEntry.shardRouting.shardId(), shardRoutingEntry);
|
||||
failedShardQueue.add(shardRoutingEntry);
|
||||
clusterService.submitStateUpdateTask("shard-failed (" + shardRoutingEntry.shardRouting + "), message [" + shardRoutingEntry.message + "]",
|
||||
new ClusterStateUpdateTask(Priority.HIGH) {
|
||||
clusterService.submitStateUpdateTask(
|
||||
"shard-failed (" + shardRoutingEntry.shardRouting + "), message [" + shardRoutingEntry.message + "]",
|
||||
shardRoutingEntry,
|
||||
ClusterStateTaskConfig.build(Priority.HIGH),
|
||||
shardFailedClusterStateHandler,
|
||||
shardFailedClusterStateHandler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ClusterState execute(ClusterState currentState) {
|
||||
if (shardRoutingEntry.processed) {
|
||||
return currentState;
|
||||
}
|
||||
|
||||
List<ShardRoutingEntry> shardRoutingEntries = new ArrayList<>();
|
||||
failedShardQueue.drainTo(shardRoutingEntries);
|
||||
|
||||
// nothing to process (a previous event has processed it already)
|
||||
if (shardRoutingEntries.isEmpty()) {
|
||||
return currentState;
|
||||
}
|
||||
|
||||
List<FailedRerouteAllocation.FailedShard> shardRoutingsToBeApplied = new ArrayList<>(shardRoutingEntries.size());
|
||||
|
||||
// mark all entries as processed
|
||||
for (ShardRoutingEntry entry : shardRoutingEntries) {
|
||||
entry.processed = true;
|
||||
shardRoutingsToBeApplied.add(new FailedRerouteAllocation.FailedShard(entry.shardRouting, entry.message, entry.failure));
|
||||
}
|
||||
|
||||
RoutingAllocation.Result routingResult = allocationService.applyFailedShards(currentState, shardRoutingsToBeApplied);
|
||||
if (!routingResult.changed()) {
|
||||
return currentState;
|
||||
}
|
||||
return ClusterState.builder(currentState).routingResult(routingResult).build();
|
||||
class ShardFailedClusterStateHandler implements ClusterStateTaskExecutor<ShardRoutingEntry>, ClusterStateTaskListener {
|
||||
@Override
|
||||
public BatchResult<ShardRoutingEntry> execute(ClusterState currentState, List<ShardRoutingEntry> tasks) throws Exception {
|
||||
BatchResult.Builder<ShardRoutingEntry> batchResultBuilder = BatchResult.builder();
|
||||
List<FailedRerouteAllocation.FailedShard> shardRoutingsToBeApplied = new ArrayList<>(tasks.size());
|
||||
for (ShardRoutingEntry task : tasks) {
|
||||
task.processed = true;
|
||||
shardRoutingsToBeApplied.add(new FailedRerouteAllocation.FailedShard(task.shardRouting, task.message, task.failure));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onFailure(String source, Throwable t) {
|
||||
logger.error("unexpected failure during [{}]", t, source);
|
||||
ClusterState maybeUpdatedState = currentState;
|
||||
try {
|
||||
RoutingAllocation.Result result = allocationService.applyFailedShards(currentState, shardRoutingsToBeApplied);
|
||||
if (result.changed()) {
|
||||
maybeUpdatedState = ClusterState.builder(currentState).routingResult(result).build();
|
||||
}
|
||||
batchResultBuilder.successes(tasks);
|
||||
} catch (Throwable t) {
|
||||
batchResultBuilder.failures(tasks, t);
|
||||
}
|
||||
return batchResultBuilder.build(maybeUpdatedState);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
|
||||
@Override
|
||||
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
|
||||
if (oldState != newState && newState.getRoutingNodes().unassigned().size() > 0) {
|
||||
logger.trace("unassigned shards after shard failures. scheduling a reroute.");
|
||||
routingService.reroute("unassigned shards after shard failures, scheduling a reroute");
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onFailure(String source, Throwable t) {
|
||||
logger.error("unexpected failure during [{}]", t, source);
|
||||
}
|
||||
}
|
||||
|
||||
private void shardStartedOnMaster(final ShardRoutingEntry shardRoutingEntry) {
|
||||
|
|
Loading…
Reference in New Issue