mirror of https://github.com/apache/lucene.git
SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1653879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ec01b5aeb5
commit
bea1e9c608
|
@ -527,6 +527,9 @@ Bug Fixes
|
|||
* SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
|
||||
(Robert Muir, Varun Thacker, shalin)
|
||||
|
||||
* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
|
||||
(shalin)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThread extends Thread {
|
|||
|
||||
// additional safeguard against the replica trying to be in the active state
|
||||
// before acknowledging the leader initiated recovery command
|
||||
if (continueTrying && collection != null && shardId != null) {
|
||||
if (collection != null && shardId != null) {
|
||||
try {
|
||||
// call out to ZooKeeper to get the leader-initiated recovery state
|
||||
String lirState =
|
||||
|
@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThread extends Thread {
|
|||
List<ZkCoreNodeProps> replicaProps =
|
||||
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
|
||||
if (replicaProps != null && replicaProps.size() > 0) {
|
||||
String replicaState = replicaProps.get(0).getState();
|
||||
if (ZkStateReader.ACTIVE.equals(replicaState)) {
|
||||
// replica published its state as "active",
|
||||
// which is bad if lirState is still "down"
|
||||
if (ZkStateReader.DOWN.equals(lirState)) {
|
||||
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
|
||||
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
|
||||
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
|
||||
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
|
||||
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
|
||||
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
|
||||
for (ZkCoreNodeProps prop : replicaProps) {
|
||||
if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
|
||||
String replicaState = prop.getState();
|
||||
if (ZkStateReader.ACTIVE.equals(replicaState)) {
|
||||
// replica published its state as "active",
|
||||
// which is bad if lirState is still "down"
|
||||
if (ZkStateReader.DOWN.equals(lirState)) {
|
||||
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
|
||||
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
|
||||
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
|
||||
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
|
||||
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
|
||||
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception ignoreMe) {
|
||||
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
|
||||
|
|
Loading…
Reference in New Issue