SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1653879 13f79535-47bb-0310-9956-ffa450edef68
2015-01-22 14:36:08 +00:00 · 2015-01-22 14:36:08 +00:00 · bea1e9c608
parent ec01b5aeb5
commit bea1e9c608
2 changed files with 22 additions and 14 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -527,6 +527,9 @@ Bug Fixes
 * SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
  (Robert Muir, Varun Thacker, shalin)

+* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
+  (shalin)
+
 Optimizations
 ----------------------

--- a/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThread extends Thread {

        // additional safeguard against the replica trying to be in the active state
        // before acknowledging the leader initiated recovery command
-        if (continueTrying && collection != null && shardId != null) {
+        if (collection != null && shardId != null) {
          try {
            // call out to ZooKeeper to get the leader-initiated recovery state
            String lirState = 
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThread extends Thread {
              List<ZkCoreNodeProps> replicaProps = 
                  zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
              if (replicaProps != null && replicaProps.size() > 0) {
-                String replicaState = replicaProps.get(0).getState();
-                if (ZkStateReader.ACTIVE.equals(replicaState)) {
-                  // replica published its state as "active", 
-                  // which is bad if lirState is still "down"
-                  if (ZkStateReader.DOWN.equals(lirState)) {
-                    // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
-                    // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
-                    log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
-                        + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
-                    zkController.ensureReplicaInLeaderInitiatedRecovery(collection, 
-                        shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                for (ZkCoreNodeProps prop : replicaProps) {
+                  if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
+                    String replicaState = prop.getState();
+                    if (ZkStateReader.ACTIVE.equals(replicaState)) {
+                      // replica published its state as "active",
+                      // which is bad if lirState is still "down"
+                      if (ZkStateReader.DOWN.equals(lirState)) {
+                        // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+                        // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
+                        log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+                            + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
+                        zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
+                            shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                      }
+                    }
+                    break;
                  }
-                }                    
-              }                    
+                }
+              }
            }                  
          } catch (Exception ignoreMe) {
            log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);