SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1653879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2015-01-22 14:36:08 +00:00
parent ec01b5aeb5
commit bea1e9c608
2 changed files with 22 additions and 14 deletions

View File

@@ -527,6 +527,9 @@ Bug Fixes
* SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
(Robert Muir, Varun Thacker, shalin)
* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
(shalin)
Optimizations
----------------------

View File

@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThread extends Thread {
// additional safeguard against the replica trying to be in the active state // additional safeguard against the replica trying to be in the active state
// before acknowledging the leader initiated recovery command // before acknowledging the leader initiated recovery command
if (continueTrying && collection != null && shardId != null) { if (collection != null && shardId != null) {
try { try {
// call out to ZooKeeper to get the leader-initiated recovery state // call out to ZooKeeper to get the leader-initiated recovery state
String lirState = String lirState =
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThread extends Thread {
List<ZkCoreNodeProps> replicaProps = List<ZkCoreNodeProps> replicaProps =
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName); zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
if (replicaProps != null && replicaProps.size() > 0) { if (replicaProps != null && replicaProps.size() > 0) {
String replicaState = replicaProps.get(0).getState(); for (ZkCoreNodeProps prop : replicaProps) {
if (ZkStateReader.ACTIVE.equals(replicaState)) { if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
// replica published its state as "active", String replicaState = prop.getState();
// which is bad if lirState is still "down" if (ZkStateReader.ACTIVE.equals(replicaState)) {
if (ZkStateReader.DOWN.equals(lirState)) { // replica published its state as "active",
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery // which is bad if lirState is still "down"
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here if (ZkStateReader.DOWN.equals(lirState)) {
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;" // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName); // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
zkController.ensureReplicaInLeaderInitiatedRecovery(collection, log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
shardId, replicaUrl, nodeProps, true); // force republish state to "down" + " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
}
}
break;
} }
} }
} }
} }
} catch (Exception ignoreMe) { } catch (Exception ignoreMe) {
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName); log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);