[Recovery] don't start a gateway recovery if source node is not found

Due to change introduced in #6825, we now start a local gateway recovery for replicas, if the source node can not be found. The recovery then fails because we never recover replicas from disk. Closes #6879
2014-07-15 15:09:44 +02:00 · 2014-07-15 15:09:44 +02:00 · 84ad29ddfa
parent d869163b66
commit 84ad29ddfa
1 changed files with 13 additions and 7 deletions
--- a/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java
+++ b/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java
@ -648,10 +648,14 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
            }
        }

-        // figure out where to recover from (node or disk, in which case sourceNode is null)
+        // if we're in peer recovery, try to find out the source node now so in case it fails, we will not create the index shard
        DiscoveryNode sourceNode = null;
        if (isPeerRecovery(shardRouting)) {
            sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting);
+            if (sourceNode == null) {
+                logger.trace("ignoring initializing shard {} - no source node can be found.", shardRouting.shardId());
+                return;
+            }
        }

        // if there is no shard, create it
@ -702,13 +706,15 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
            return;
        }

-        if (sourceNode != null) {
+        if (isPeerRecovery(shardRouting)) {
            try {
+
+                assert sourceNode != null : "peer recovery started but sourceNode is null";
+
                // we don't mark this one as relocated at the end.
                // For primaries: requests in any case are routed to both when its relocating and that way we handle
                //    the edge case where its mark as relocated, and we might need to roll it back...
                // For replicas: we are recovering a backup from a primary
-
                RecoveryState.Type type = shardRouting.primary() ? RecoveryState.Type.RELOCATION : RecoveryState.Type.REPLICA;
                final Store store = indexShard.store();
                final StartRecoveryRequest request;
@ -716,7 +722,7 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
                try {
                    store.failIfCorrupted();
                    request = new StartRecoveryRequest(indexShard.shardId(), sourceNode, nodes.localNode(),
-                        false, store.getMetadata().asMap(), type, recoveryIdGenerator.incrementAndGet());
+                            false, store.getMetadata().asMap(), type, recoveryIdGenerator.incrementAndGet());
                } finally {
                    store.decRef();
                }
@ -763,7 +769,7 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
                    // only recover from started primary, if we can't find one, we will do it next round
                    sourceNode = nodes.get(entry.currentNodeId());
                    if (sourceNode == null) {
-                        logger.trace("can't find replica source node because primary shard {} is assigned to an unknown node. ignoring.", entry);
+                        logger.trace("can't find replica source node because primary shard {} is assigned to an unknown node.", entry);
                        return null;
                    }
                    break;
@ -771,12 +777,12 @@ public class IndicesClusterStateService extends AbstractLifecycleComponent<Indic
            }

            if (sourceNode == null) {
-                logger.trace("can't find replica source node for {} because a primary shard can not be found. ignoring.", shardRouting.shardId());
+                logger.trace("can't find replica source node for {} because a primary shard can not be found.", shardRouting.shardId());
            }
        } else if (shardRouting.relocatingNodeId() != null) {
            sourceNode = nodes.get(shardRouting.relocatingNodeId());
            if (sourceNode == null) {
-                logger.trace("can't find relocation source node for shard {} because it is assigned to an unknown node [{}]. ignoring.", shardRouting.shardId(), shardRouting.relocatingNodeId());
+                logger.trace("can't find relocation source node for shard {} because it is assigned to an unknown node [{}].", shardRouting.shardId(), shardRouting.relocatingNodeId());
            }
        } else {
            throw new ElasticsearchIllegalStateException("trying to find source node for peer recovery when routing state means no peer recovery: " + shardRouting);