mirror of https://github.com/apache/lucene.git
SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1653879 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ec01b5aeb5
commit
bea1e9c608
|
@ -527,6 +527,9 @@ Bug Fixes
|
||||||
* SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
|
* SOLR-6640: Close searchers before rollback and recovery to avoid index corruption.
|
||||||
(Robert Muir, Varun Thacker, shalin)
|
(Robert Muir, Varun Thacker, shalin)
|
||||||
|
|
||||||
|
* SOLR-6847: LeaderInitiatedRecoveryThread compares wrong replica's state with lirState.
|
||||||
|
(shalin)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -193,7 +193,7 @@ public class LeaderInitiatedRecoveryThread extends Thread {
|
||||||
|
|
||||||
// additional safeguard against the replica trying to be in the active state
|
// additional safeguard against the replica trying to be in the active state
|
||||||
// before acknowledging the leader initiated recovery command
|
// before acknowledging the leader initiated recovery command
|
||||||
if (continueTrying && collection != null && shardId != null) {
|
if (collection != null && shardId != null) {
|
||||||
try {
|
try {
|
||||||
// call out to ZooKeeper to get the leader-initiated recovery state
|
// call out to ZooKeeper to get the leader-initiated recovery state
|
||||||
String lirState =
|
String lirState =
|
||||||
|
@ -218,20 +218,25 @@ public class LeaderInitiatedRecoveryThread extends Thread {
|
||||||
List<ZkCoreNodeProps> replicaProps =
|
List<ZkCoreNodeProps> replicaProps =
|
||||||
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
|
zkStateReader.getReplicaProps(collection, shardId, leaderCoreNodeName);
|
||||||
if (replicaProps != null && replicaProps.size() > 0) {
|
if (replicaProps != null && replicaProps.size() > 0) {
|
||||||
String replicaState = replicaProps.get(0).getState();
|
for (ZkCoreNodeProps prop : replicaProps) {
|
||||||
if (ZkStateReader.ACTIVE.equals(replicaState)) {
|
if (replicaCoreNodeName.equals(((Replica) prop.getNodeProps()).getName())) {
|
||||||
// replica published its state as "active",
|
String replicaState = prop.getState();
|
||||||
// which is bad if lirState is still "down"
|
if (ZkStateReader.ACTIVE.equals(replicaState)) {
|
||||||
if (ZkStateReader.DOWN.equals(lirState)) {
|
// replica published its state as "active",
|
||||||
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
|
// which is bad if lirState is still "down"
|
||||||
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
|
if (ZkStateReader.DOWN.equals(lirState)) {
|
||||||
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
|
// OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
|
||||||
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
|
// so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
|
||||||
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
|
log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
|
||||||
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
|
+ " forcing it back to down state to re-run the leader-initiated recovery process; props: "+replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
|
||||||
|
zkController.ensureReplicaInLeaderInitiatedRecovery(collection,
|
||||||
|
shardId, replicaUrl, nodeProps, true); // force republish state to "down"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (Exception ignoreMe) {
|
} catch (Exception ignoreMe) {
|
||||||
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
|
log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
|
||||||
|
|
Loading…
Reference in New Issue