diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index cd0cd07ff80..b47d3fc9da2 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -527,6 +527,9 @@ Bug Fixes
 * SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
   (Robert Muir, Varun Thacker, shalin)
 
+* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
+  (shalin)
+
 Optimizations
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java b/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
index 489a52e028e..a046fb68e9c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderInitiatedRecoveryThread.java
@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThread extends Thread {
 
       // additional safeguard against the replica trying to be in the active state
       // before acknowledging the leader initiated recovery command
-      if (continueTrying && collection != null && shardId != null) {
+      if (collection != null && shardId != null) {
         try {
           // call out to ZooKeeper to get the leader-initiated recovery state
           String lirState =
@@ -218,20 +218,25 @@
             List<ZkCoreNodeProps> replicaProps =
                 zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
             if (replicaProps != null && replicaProps.size() > 0) {
-              String replicaState = replicaProps.get(0).getState();
-              if (ZkStateReader.ACTIVE.equals(replicaState)) {
-                // replica published its state as "active",
-                // which is bad if lirState is still "down"
-                if (ZkStateReader.DOWN.equals(lirState)) {
-                  // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
-                  // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
-                  log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
-                      + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
-                  zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
-                      shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+              for (ZkCoreNodeProps prop : replicaProps) {
+                if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
+                  String replicaState = prop.getState();
+                  if (ZkStateReader.ACTIVE.equals(replicaState)) {
+                    // replica published its state as "active",
+                    // which is bad if lirState is still "down"
+                    if (ZkStateReader.DOWN.equals(lirState)) {
+                      // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+                      // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
+                      log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+                          + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
+                      zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
+                          shardId, replicaUrl, nodeProps, true); // force republish state to "down"
+                    }
+                  }
+                  break;
                 }
-              }
-            }
+              }
+            }
           }
         } catch (Exception ignoreMe) {
           log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
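
The essence of the fix: the leader must compare the leader-initiated-recovery (LIR) state stored in ZooKeeper against the state published by the specific replica it is trying to recover, identified by coreNodeName, rather than whichever replica happens to come first in the list returned by zkStateReader.getReplicaProps(...). The standalone Java sketch below illustrates that lookup-by-coreNodeName pattern; the ReplicaProps record, matchingReplica helper, and class name are hypothetical illustrations, not Solr's actual API.

import java.util.List;
import java.util.Optional;

// Hypothetical stand-in for the per-replica properties Solr keeps in ZkCoreNodeProps:
// each replica carries its coreNodeName and its last published state ("active", "down", ...).
record ReplicaProps(String coreNodeName, String state) {}

public class LirStateCheckSketch {

  // Pre-SOLR-6847 behavior: blindly inspect the first replica in the shard's list,
  // which may not be the replica the leader-initiated-recovery thread is watching.
  static Optional<ReplicaProps> firstReplica(List<ReplicaProps> replicas) {
    return replicas.stream().findFirst();
  }

  // Post-SOLR-6847 behavior: locate the replica whose coreNodeName matches the one under recovery.
  static Optional<ReplicaProps> matchingReplica(List<ReplicaProps> replicas, String coreNodeName) {
    return replicas.stream()
        .filter(p -> coreNodeName.equals(p.coreNodeName()))
        .findFirst();
  }

  public static void main(String[] args) {
    // core_node3 is the replica being recovered; core_node1 is some other, healthy replica.
    List<ReplicaProps> shard = List.of(
        new ReplicaProps("core_node1", "active"),
        new ReplicaProps("core_node3", "active"));
    String lirState = "down"; // ZooKeeper says core_node3 never acknowledged the recovery command

    // Only force a replica back to "down" when *that* replica claims to be active
    // while its leader-initiated-recovery state is still "down".
    matchingReplica(shard, "core_node3").ifPresent(replica -> {
      if ("active".equals(replica.state()) && "down".equals(lirState)) {
        System.out.println("Force " + replica.coreNodeName()
            + " back to down and re-run leader-initiated recovery");
      }
    });
  }
}

With more than one replica in the shard, firstReplica could return a healthy sibling, so the active-but-not-acknowledged check would silently pass and the stale replica would never be pushed back into recovery; matching on coreNodeName closes that gap.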