Failure to recover properly on node(s) restart

When a node restarts, it might cancel one recovery of a shard id only to be assigned another recovery of the same shard id in the next cycle. We should detect this case and handle it properly. This fixes the annoying message seen by users: "suspect illegal state: trying to move shard from primary mode to replica mode".
2012-06-22 17:46:57 +02:00 · 2012-06-22 17:46:57 +02:00 · 1780a2a067
parent cc3fab45ff
commit 1780a2a067
7 changed files with 129 additions and 16 deletions
--- a/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
+++ b/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
@ -294,6 +294,7 @@ public class AllocationService extends AbstractComponent {
                // we know this since it has a relocating node id (the node we relocate from) and our state is INITIALIZING (and not RELOCATING)
                boolean isRelocationDestinationShard = relocatingNodeId != null && shardRoutingEntry.initializing();
                boolean remove = false;
                boolean currentNodeIsDead = false;
                if (!liveNodeIds.contains(shardRoutingEntry.currentNodeId())) {
                    changed = true;
@ -305,7 +306,7 @@ public class AllocationService extends AbstractComponent {
                    shardRoutingEntry.deassignNode();
                    currentNodeIsDead = true;
-                    shardsIterator.remove();
+                    remove = true;
                }
                // move source shard back to active state and cancel relocation mode.
@ -319,6 +320,10 @@ public class AllocationService extends AbstractComponent {
                if (isRelocationDestinationShard && !liveNodeIds.contains(relocatingNodeId)) {
                    changed = true;
                    remove = true;
                }
                if (remove) {
                    shardsIterator.remove();
                }
            }
--- a/src/main/java/org/elasticsearch/index/shard/ShardId.java
+++ b/src/main/java/org/elasticsearch/index/shard/ShardId.java
@ -102,6 +102,7 @@ public class ShardId implements Serializable, Streamable {
    public void readFrom(StreamInput in) throws IOException {
        // Populate this instance from the stream: the index name is read first, then the shard number
        // (via readVInt). The read order must match the order the values were written in.
        index = Index.readIndexName(in);
        shardId = in.readVInt();
        // Recompute the stored hash only after both values have been read, so it is not left stale.
        // NOTE(review): presumably computeHashCode() derives from index and shardId - confirm.
        hashCode = computeHashCode();
    }
    @Override
--- a/src/main/java/org/elasticsearch/index/shard/service/InternalIndexShard.java
+++ b/src/main/java/org/elasticsearch/index/shard/service/InternalIndexShard.java
@ -217,7 +217,7 @@ public class InternalIndexShard extends AbstractIndexShardComponent implements I
        }
        if (currentRouting != null) {
            if (!shardRouting.primary() && currentRouting.primary()) {
-                logger.warn("suspect illegal state: trying to move shard from primary mode to backup mode");
+                logger.warn("suspect illegal state: trying to move shard from primary mode to replica mode");
            }
            // if its the same routing, return
            if (currentRouting.equals(shardRouting)) {
--- a/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java
+++ b/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java
@ -493,6 +493,32 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
                continue;
            }
            if (indexService.hasShard(shardId)) {
                InternalIndexShard indexShard = (InternalIndexShard) indexService.shard(shardId);
                if (!shardRouting.equals(indexShard.routingEntry())) {
                    ShardRouting currentRoutingEntry = indexShard.routingEntry();
                    boolean needToDeleteCurrentShard = false;
                    if (currentRoutingEntry.initializing() && shardRouting.initializing()) {
                        // both are initializing, see if they are different instances of the shard routing, i.e. they got switched on us
                        if (currentRoutingEntry.primary() && !shardRouting.primary()) {
                            needToDeleteCurrentShard = true;
                        }
                        // recovering from different nodes..., restart recovery
                        if (currentRoutingEntry.relocatingNodeId() != null && shardRouting.relocatingNodeId() != null &&
                                !currentRoutingEntry.relocatingNodeId().equals(shardRouting.relocatingNodeId())) {
                            needToDeleteCurrentShard = true;
                        }
                    }
                    if (needToDeleteCurrentShard) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("[{}][{}] removing shard (different instance of it allocated on this node)", shardRouting.index(), shardRouting.id());
                        }
                        recoveryTarget.cancelRecovery(shardRouting.shardId());
                        indexService.removeShard(shardRouting.id(), "removing shard (different instance of it allocated on this node)");
                    }
                }
            }
            if (indexService.hasShard(shardId)) {
                InternalIndexShard indexShard = (InternalIndexShard) indexService.shard(shardId);
                if (!shardRouting.equals(indexShard.routingEntry())) {
--- a/src/main/java/org/elasticsearch/indices/recovery/RecoveryStatus.java
+++ b/src/main/java/org/elasticsearch/indices/recovery/RecoveryStatus.java
@ -39,6 +39,10 @@ public class RecoveryStatus {
        DONE
    }
    volatile Thread recoveryThread;
    volatile boolean canceled;
    volatile boolean sentCanceledToSource;
    ConcurrentMap<String, IndexOutput> openIndexOutputs = ConcurrentCollections.newConcurrentMap();
    ConcurrentMap<String, String> checksums = ConcurrentCollections.newConcurrentMap();
--- a/src/main/java/org/elasticsearch/indices/recovery/RecoveryTarget.java
+++ b/src/main/java/org/elasticsearch/indices/recovery/RecoveryTarget.java
@ -117,6 +117,34 @@ public class RecoveryTarget extends AbstractComponent {
        return peerRecoveryStatus;
    }
    /**
     * Cancels the on going recovery (if any) of the given shard on this recovery target.
     * <p>
     * The recovery is marked as canceled and the thread running it is interrupted. The call then
     * waits, polling every 100ms for at most roughly 10 seconds, until one of the recovery request
     * handlers has observed the cancellation and flagged it through {@code sentCanceledToSource}.
     * Finally the recovery is removed from the on going recoveries and cleaned up.
     * <p>
     * If there is no on going recovery for the shard, or the cancellation was already reported to
     * the source, this method returns immediately without doing anything.
     *
     * @param shardId the shard whose recovery should be canceled
     */
    public void cancelRecovery(ShardId shardId) {
        RecoveryStatus recoveryStatus = onGoingRecoveries.get(shardId);
        // there may be no entry, e.g. if the recovery was already removed (source got canceled first)
        if (recoveryStatus == null) {
            return;
        }
        // a handler already reported the cancellation to the source, nothing left to wait for
        if (recoveryStatus.sentCanceledToSource) {
            return;
        }
        recoveryStatus.canceled = true;
        // read the volatile field once so the null check and the interrupt act on the same thread
        Thread recoveryThread = recoveryStatus.recoveryThread;
        if (recoveryThread != null) {
            recoveryThread.interrupt();
        }
        // use the monotonic clock for the grace period, wall-clock adjustments must not
        // shorten or prolong the wait
        final long startNanos = System.nanoTime();
        // give it a grace period of actually getting the sent ack part
        while (!recoveryStatus.sentCanceledToSource) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException ignored) {
                // deliberately ignored: keep waiting for the ack until the grace period expires
            }
            long elapsedMillis = (System.nanoTime() - startNanos) / 1000000L;
            if (elapsedMillis > 10000) {
                break;
            }
        }
        removeAndCleanOnGoingRecovery(shardId);
    }
    public void startRecovery(final StartRecoveryRequest request, final boolean fromRetry, final RecoveryListener listener) {
        if (request.sourceNode() == null) {
            listener.onIgnoreRecovery(false, "No node to recover from, retry on next cluster state update");
@ -170,6 +198,7 @@ public class RecoveryTarget extends AbstractComponent {
            recovery = new RecoveryStatus();
            onGoingRecoveries.put(request.shardId(), recovery);
        }
        recovery.recoveryThread = Thread.currentThread();
        try {
            logger.trace("[{}][{}] starting recovery from {}", request.shardId().index().name(), request.shardId().id(), request.sourceNode());
@ -207,6 +236,11 @@ public class RecoveryTarget extends AbstractComponent {
            listener.onRecoveryDone();
        } catch (Exception e) {
 //            logger.trace("[{}][{}] Got exception on recovery", e, request.shardId().index().name(), request.shardId().id());
            if (recovery.canceled) {
                // don't remove it, the cancellation code will remove it...
                listener.onIgnoreRecovery(false, "canceled recovery");
                return;
            }
            if (shard.state() == IndexShardState.CLOSED) {
                removeAndCleanOnGoingRecovery(request.shardId());
                listener.onIgnoreRecovery(false, "local shard closed, stop recovery");
@ -274,6 +308,9 @@ public class RecoveryTarget extends AbstractComponent {
        // clean it from the on going recoveries since it is being closed
        RecoveryStatus peerRecoveryStatus = onGoingRecoveries.remove(shardId);
        if (peerRecoveryStatus != null) {
            // just mark it as canceled as well, just in case there are in flight requests
            // coming from the recovery target
            peerRecoveryStatus.canceled = true;
            // clean open index outputs
            for (Map.Entry<String, IndexOutput> entry : peerRecoveryStatus.openIndexOutputs.entrySet()) {
                synchronized (entry.getValue()) {
@ -310,6 +347,10 @@ public class RecoveryTarget extends AbstractComponent {
                // shard is getting closed on us
                throw new IndexShardClosedException(shard.shardId());
            }
            if (onGoingRecovery.canceled) {
                onGoingRecovery.sentCanceledToSource = true;
                throw new IndexShardClosedException(shard.shardId());
            }
            onGoingRecovery.stage = RecoveryStatus.Stage.TRANSLOG;
            shard.performRecoveryPrepareForTranslog();
@ -332,15 +373,19 @@ public class RecoveryTarget extends AbstractComponent {
        @Override
        public void messageReceived(RecoveryFinalizeRecoveryRequest request, TransportChannel channel) throws Exception {
            InternalIndexShard shard = (InternalIndexShard) indicesService.indexServiceSafe(request.shardId().index().name()).shardSafe(request.shardId().id());
-            RecoveryStatus peerRecoveryStatus = onGoingRecoveries.get(shard.shardId());
+            RecoveryStatus onGoingRecovery = onGoingRecoveries.get(shard.shardId());
-            if (peerRecoveryStatus == null) {
+            if (onGoingRecovery == null) {
                // shard is getting closed on us
                throw new IndexShardClosedException(shard.shardId());
            }
-            peerRecoveryStatus.stage = RecoveryStatus.Stage.FINALIZE;
+            if (onGoingRecovery.canceled) {
-            shard.performRecoveryFinalization(false, peerRecoveryStatus);
+                onGoingRecovery.sentCanceledToSource = true;
-            peerRecoveryStatus.time = System.currentTimeMillis() - peerRecoveryStatus.startTime;
+                throw new IndexShardClosedException(shard.shardId());
-            peerRecoveryStatus.stage = RecoveryStatus.Stage.DONE;
+            }
            onGoingRecovery.stage = RecoveryStatus.Stage.FINALIZE;
            shard.performRecoveryFinalization(false, onGoingRecovery);
            onGoingRecovery.time = System.currentTimeMillis() - onGoingRecovery.startTime;
            onGoingRecovery.stage = RecoveryStatus.Stage.DONE;
            channel.sendResponse(VoidStreamable.INSTANCE);
        }
    }
@ -360,17 +405,35 @@ public class RecoveryTarget extends AbstractComponent {
        @Override
        public void messageReceived(RecoveryTranslogOperationsRequest request, TransportChannel channel) throws Exception {
-            InternalIndexShard shard = (InternalIndexShard) indicesService.indexServiceSafe(request.shardId().index().name()).shardSafe(request.shardId().id());
+            RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.shardId());
            for (Translog.Operation operation : request.operations()) {
                shard.performRecoveryOperation(operation);
            }
            RecoveryStatus onGoingRecovery = onGoingRecoveries.get(shard.shardId());
            if (onGoingRecovery == null) {
                // shard is getting closed on us
-                throw new IndexShardClosedException(shard.shardId());
+                throw new IndexShardClosedException(request.shardId());
            }
            if (onGoingRecovery.canceled) {
                onGoingRecovery.sentCanceledToSource = true;
                throw new IndexShardClosedException(request.shardId());
            }
            InternalIndexShard shard = (InternalIndexShard) indicesService.indexServiceSafe(request.shardId().index().name()).shardSafe(request.shardId().id());
            for (Translog.Operation operation : request.operations()) {
                if (onGoingRecovery.canceled) {
                    onGoingRecovery.sentCanceledToSource = true;
                    throw new IndexShardClosedException(request.shardId());
                }
                shard.performRecoveryOperation(operation);
                onGoingRecovery.currentTranslogOperations++;
            }
            onGoingRecovery = onGoingRecoveries.get(request.shardId());
            if (onGoingRecovery == null) {
                // shard is getting closed on us
                throw new IndexShardClosedException(request.shardId());
            }
            if (onGoingRecovery.canceled) {
                onGoingRecovery.sentCanceledToSource = true;
                throw new IndexShardClosedException(request.shardId());
            }
            onGoingRecovery.currentTranslogOperations += request.operations().size();
            channel.sendResponse(VoidStreamable.INSTANCE);
        }
@ -396,6 +459,10 @@ public class RecoveryTarget extends AbstractComponent {
                // shard is getting closed on us
                throw new IndexShardClosedException(shard.shardId());
            }
            if (onGoingRecovery.canceled) {
                onGoingRecovery.sentCanceledToSource = true;
                throw new IndexShardClosedException(shard.shardId());
            }
            onGoingRecovery.phase1FileNames = request.phase1FileNames;
            onGoingRecovery.phase1FileSizes = request.phase1FileSizes;
            onGoingRecovery.phase1ExistingFileNames = request.phase1ExistingFileNames;
@ -427,6 +494,10 @@ public class RecoveryTarget extends AbstractComponent {
                // shard is getting closed on us
                throw new IndexShardClosedException(shard.shardId());
            }
            if (onGoingRecovery.canceled) {
                onGoingRecovery.sentCanceledToSource = true;
                throw new IndexShardClosedException(shard.shardId());
            }
            // first, we go and move files that were created with the recovery id suffix to
            // the actual names, its ok if we have a corrupted index here, since we have replicas
@ -495,6 +566,10 @@ public class RecoveryTarget extends AbstractComponent {
                // shard is getting closed on us
                throw new IndexShardClosedException(shard.shardId());
            }
            if (onGoingRecovery.canceled) {
                onGoingRecovery.sentCanceledToSource = true;
                throw new IndexShardClosedException(shard.shardId());
            }
            IndexOutput indexOutput;
            if (request.position() == 0) {
                // first request
--- a/src/test/java/org/elasticsearch/test/stress/rollingrestart/QuickRollingRestartStressTest.java
+++ b/src/test/java/org/elasticsearch/test/stress/rollingrestart/QuickRollingRestartStressTest.java
@ -91,7 +91,9 @@ public class QuickRollingRestartStressTest {
            if (clusterHealthResponse.timedOut()) {
                System.err.println("--> timed out waiting for green state...");
                ClusterState state = client.client().admin().cluster().prepareState().execute().actionGet().state();
                System.out.println(state.nodes().prettyPrint());
                System.out.println(state.routingTable().prettyPrint());
                System.out.println(state.routingNodes().prettyPrint());
                throw new ElasticSearchException("timed out waiting for green state");
            } else {
                System.out.println("--> got green status");