SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1653879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2015-01-22 14:36:08 +00:00
parent ec01b5aeb5
commit bea1e9c608
2 changed files with 22 additions and 14 deletions

View File

@@ -527,6 +527,9 @@ Bug Fixes
* SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
(Robert Muir, Varun Thacker, shalin)
* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
(shalin)
Optimizations
----------------------

View File

@@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThread extends Thread {
// additional safeguard against the replica trying to be in the active state
// before acknowledging the leader initiated recovery command
if (continueTrying && collection != null && shardId != null) {
if (collection != null && shardId != null) {
try {
// call out to ZooKeeper to get the leader-initiated recovery state
String lirState =
@@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThread extends Thread {
List<ZkCoreNodeProps> replicaProps =
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
if (replicaProps != null && replicaProps.size() > 0) {
String replicaState = replicaProps.get(0).getState();
if (ZkStateReader.ACTIVE.equals(replicaState)) {
// replica published its state as "active",
// which is bad if lirState is still "down"
if (ZkStateReader.DOWN.equals(lirState)) {
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
for (ZkCoreNodeProps prop : replicaProps) {
if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
String replicaState = prop.getState();
if (ZkStateReader.ACTIVE.equals(replicaState)) {
// replica published its state as "active",
// which is bad if lirState is still "down"
if (ZkStateReader.DOWN.equals(lirState)) {
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
}
}
break;
}
}
}
}
}
}
} catch (Exception ignoreMe) {
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);