SOLR-8085: Fix a variety of issues that can result in replicas getting out of sync.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1706423 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2015-10-02 14:40:13 +00:00
parent b095fdfd9f
commit 88b43c815e
3 changed files with 44 additions and 31 deletions

View File

@ -221,6 +221,8 @@ Bug Fixes
* SOLR-8095: Allow disabling HDFS Locality Metrics and disable by default as it may have performance
implications on rapidly changing indexes. (Mike Drob via Mark Miller)
* SOLR-8085: Fix a variety of issues that can result in replicas getting out of sync. (yonik, Mark Miller)
Optimizations
----------------------

View File

@ -54,8 +54,6 @@ import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.PeerSync;
@ -340,6 +338,10 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
return;
}
log.info("Begin buffering updates. core=" + coreName);
ulog.bufferUpdates();
replayed = false;
log.info("Publishing state of core " + core.getName() + " as recovering, leader is " + leaderUrl + " and I am "
+ ourUrl);
zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
@ -390,7 +392,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
new ModifiableSolrParams());
// force open a new searcher
core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
log.info("PeerSync Recovery was successful - registering as Active.");
log.info("PeerSync stage of recovery was successful.");
// solrcloud_debug
if (log.isDebugEnabled()) {
@ -410,11 +412,12 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
log.debug("Error in solrcloud_debug block", e);
}
}
// sync success - register as active and return
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
log.info("Replaying updates buffered during PeerSync.");
replay(core);
replayed = true;
// sync success
successfulRecovery = true;
close = true;
return;
}
@ -427,10 +430,6 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
}
log.info("Starting Replication Recovery.");
log.info("Begin buffering updates.");
ulog.bufferUpdates();
replayed = false;
try {
@ -449,31 +448,40 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
break;
}
log.info("Replication Recovery was successful - registering as Active.");
// if there are pending recovery requests, don't advert as active
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
close = true;
log.info("Replication Recovery was successful.");
successfulRecovery = true;
recoveryListener.recovered();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
log.warn("Recovery was interrupted", e);
close = true;
} catch (Exception e) {
SolrException.log(log, "Error while trying to recover", e);
} finally {
if (!replayed) {
try {
ulog.dropBufferedUpdates();
} catch (Exception e) {
SolrException.log(log, "", e);
}
}
}
} catch (Exception e) {
SolrException.log(log, "Error while trying to recover.", e);
SolrException.log(log, "Error while trying to recover. core=" + coreName, e);
} finally {
if (!replayed) {
try {
ulog.dropBufferedUpdates();
} catch (Exception e) {
SolrException.log(log, "", e);
}
}
if (successfulRecovery) {
log.info("Registering as Active after recovery.");
try {
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
} catch (Exception e) {
log.error("Could not publish as ACTIVE after succesful recovery", e);
successfulRecovery = false;
}
if (successfulRecovery) {
close = true;
recoveryListener.recovered();
}
}
}
if (!successfulRecovery) {

View File

@ -41,7 +41,7 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
public static Logger log = LoggerFactory.getLogger(DefaultSolrCoreState.class);
private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
private final Object recoveryLock = new Object();
private final ActionThrottle recoveryThrottle = new ActionThrottle("recovery", 10000);
@ -59,6 +59,10 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
private RecoveryStrategy recoveryStrat;
private volatile boolean lastReplicationSuccess = true;
// will we attempt recovery as if we just started up (i.e. use starting versions rather than recent versions for peersync
// so we aren't looking at update versions that have started buffering since we came up.
private volatile boolean recoveringAfterStartup = true;
private RefCounted<IndexWriter> refCntWriter;
protected final ReentrantLock commitLock = new ReentrantLock();
@ -272,10 +276,6 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
if (closed) return;
}
// if true, we are recovering after startup and shouldn't have (or be receiving) additional updates (except for
// local tlog recovery)
boolean recoveringAfterStartup = recoveryStrat == null;
recoveryThrottle.minimumWaitBetweenActions();
recoveryThrottle.markAttemptingAction();
@ -310,11 +310,14 @@ public final class DefaultSolrCoreState extends SolrCoreState implements Recover
}
}
/** called from recoveryStrat on a successful recovery */
@Override
public void recovered() {
recoveringAfterStartup = false; // once we have successfully recovered, we no longer need to act as if we are recovering after startup
recoveryRunning = false;
}
/** called from recoveryStrat on a failed recovery */
@Override
public void failed() {
recoveryRunning = false;