Merge pull request #15016 from jasontedor/shard-failure-batch

Use general cluster state batching mechanism for shard failures
2015-12-03 14:02:48 -05:00 · 2015-12-03 14:02:48 -05:00 · 182c22f23f
parent d2e07f21e0 b70d97f36b
commit 182c22f23f
1 changed files with 37 additions and 43 deletions
--- a/core/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java
+++ b/core/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java
@ -20,9 +20,7 @@
 package org.elasticsearch.cluster.action.shard;

 import org.elasticsearch.ExceptionsHelper;
-import org.elasticsearch.cluster.ClusterService;
-import org.elasticsearch.cluster.ClusterState;
-import org.elasticsearch.cluster.ClusterStateUpdateTask;
+import org.elasticsearch.cluster.*;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.RoutingService;
@ -46,7 +44,6 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.CountDownLatch;

 import static org.elasticsearch.cluster.routing.ShardRouting.readShardRoutingEntry;

@ -64,7 +61,6 @@ public class ShardStateAction extends AbstractComponent {
    private final RoutingService routingService;

    private final BlockingQueue<ShardRoutingEntry> startedShardsQueue = ConcurrentCollections.newBlockingQueue();
-    private final BlockingQueue<ShardRoutingEntry> failedShardQueue = ConcurrentCollections.newBlockingQueue();

    @Inject
    public ShardStateAction(Settings settings, ClusterService clusterService, TransportService transportService,
@ -141,54 +137,52 @@ public class ShardStateAction extends AbstractComponent {
                });
    }

+    private final ShardFailedClusterStateHandler shardFailedClusterStateHandler = new ShardFailedClusterStateHandler();
+
    private void handleShardFailureOnMaster(final ShardRoutingEntry shardRoutingEntry) {
        logger.warn("{} received shard failed for {}", shardRoutingEntry.failure, shardRoutingEntry.shardRouting.shardId(), shardRoutingEntry);
-        failedShardQueue.add(shardRoutingEntry);
-        clusterService.submitStateUpdateTask("shard-failed (" + shardRoutingEntry.shardRouting + "), message [" + shardRoutingEntry.message + "]",
-                new ClusterStateUpdateTask(Priority.HIGH) {
+        clusterService.submitStateUpdateTask(
+                "shard-failed (" + shardRoutingEntry.shardRouting + "), message [" + shardRoutingEntry.message + "]",
+                shardRoutingEntry,
+                ClusterStateTaskConfig.build(Priority.HIGH),
+                shardFailedClusterStateHandler,
+                shardFailedClusterStateHandler);
+    }

-            @Override
-            public ClusterState execute(ClusterState currentState) {
-                if (shardRoutingEntry.processed) {
-                    return currentState;
-                }
-
-                List<ShardRoutingEntry> shardRoutingEntries = new ArrayList<>();
-                failedShardQueue.drainTo(shardRoutingEntries);
-
-                // nothing to process (a previous event has processed it already)
-                if (shardRoutingEntries.isEmpty()) {
-                    return currentState;
-                }
-
-                List<FailedRerouteAllocation.FailedShard> shardRoutingsToBeApplied = new ArrayList<>(shardRoutingEntries.size());
-
-                // mark all entries as processed
-                for (ShardRoutingEntry entry : shardRoutingEntries) {
-                    entry.processed = true;
-                    shardRoutingsToBeApplied.add(new FailedRerouteAllocation.FailedShard(entry.shardRouting, entry.message, entry.failure));
-                }
-
-                RoutingAllocation.Result routingResult = allocationService.applyFailedShards(currentState, shardRoutingsToBeApplied);
-                if (!routingResult.changed()) {
-                    return currentState;
-                }
-                return ClusterState.builder(currentState).routingResult(routingResult).build();
+    class ShardFailedClusterStateHandler implements ClusterStateTaskExecutor<ShardRoutingEntry>, ClusterStateTaskListener {
+        @Override
+        public BatchResult<ShardRoutingEntry> execute(ClusterState currentState, List<ShardRoutingEntry> tasks) throws Exception {
+            BatchResult.Builder<ShardRoutingEntry> batchResultBuilder = BatchResult.builder();
+            List<FailedRerouteAllocation.FailedShard> shardRoutingsToBeApplied = new ArrayList<>(tasks.size());
+            for (ShardRoutingEntry task : tasks) {
+                task.processed = true;
+                shardRoutingsToBeApplied.add(new FailedRerouteAllocation.FailedShard(task.shardRouting, task.message, task.failure));
            }
-
-            @Override
-            public void onFailure(String source, Throwable t) {
-                logger.error("unexpected failure during [{}]", t, source);
+            ClusterState maybeUpdatedState = currentState;
+            try {
+                RoutingAllocation.Result result = allocationService.applyFailedShards(currentState, shardRoutingsToBeApplied);
+                if (result.changed()) {
+                    maybeUpdatedState = ClusterState.builder(currentState).routingResult(result).build();
+                }
+                batchResultBuilder.successes(tasks);
+            } catch (Throwable t) {
+                batchResultBuilder.failures(tasks, t);
            }
+            return batchResultBuilder.build(maybeUpdatedState);
+        }

-            @Override
-            public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
+        @Override
+        public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
                if (oldState != newState && newState.getRoutingNodes().unassigned().size() > 0) {
                    logger.trace("unassigned shards after shard failures. scheduling a reroute.");
                    routingService.reroute("unassigned shards after shard failures, scheduling a reroute");
                }
-            }
-        });
+        }
+
+        @Override
+        public void onFailure(String source, Throwable t) {
+            logger.error("unexpected failure during [{}]", t, source);
+        }
    }

    private void shardStartedOnMaster(final ShardRoutingEntry shardRoutingEntry) {