SOLR-11293: Potential data loss in TLOG replicas after replication failures

This commit is contained in:
Noble Paul 2017-08-31 22:42:19 +09:30
parent 913a2c4345
commit 1d31370577
5 changed files with 12 additions and 10 deletions

View File

@ -116,6 +116,8 @@ Bug Fixes
* SOLR-11278: Stopping CDCR should cancel a running bootstrap operation. (Amrit Sarkar, shalin)
+* SOLR-11293: Potential data loss in TLOG replicas after replication failures (noble)
Optimizations
----------------------

View File

@ -89,7 +89,6 @@ public class ReplicateFromLeader {
if (pollSuccess) {
String commitVersion = getCommitVersion(core);
if (commitVersion == null) return;
-if (Long.parseLong(commitVersion) == lastVersion) return;
UpdateLog updateLog = solrCore.getUpdateHandler().getUpdateLog();
SolrQueryRequest req = new LocalSolrQueryRequest(core,
new ModifiableSolrParams());

View File

@ -989,7 +989,7 @@ public class ZkController {
if (isTlogReplicaAndNotLeader) {
String commitVersion = ReplicateFromLeader.getCommitVersion(core);
if (commitVersion != null) {
-ulog.copyOverOldUpdates(Long.parseLong(commitVersion));
+ulog.copyOverOldUpdates(Long.parseLong(commitVersion), true);
}
}
// we will call register again after zk expiration and on reload

View File

@ -1158,10 +1158,12 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
protected void copyAndSwitchToNewTlog(CommitUpdateCommand cuc) {
synchronized (this) {
-if (tlog == null) return;
+if (tlog == null && prevTlog == null && prevMapLog2 == null && logs.isEmpty()) {
+return;
+}
preCommit(cuc);
try {
-copyOverOldUpdates(cuc.getVersion());
+copyOverOldUpdates(cuc.getVersion(), false);
} finally {
postCommit(cuc);
}
@ -1171,8 +1173,9 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
/**
* Copy over updates from prevTlog or last tlog (in tlog folder) to a new tlog
* @param commitVersion any updates that have version larger than the commitVersion will be copied over
+* @param omitCommitted if a tlog is already committed then don't read it
*/
-public void copyOverOldUpdates(long commitVersion) {
+public void copyOverOldUpdates(long commitVersion, boolean omitCommitted) {
TransactionLog oldTlog = prevTlog;
if (oldTlog == null && !logs.isEmpty()) {
oldTlog = logs.getFirst();
@ -1182,9 +1185,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
}
try {
-if (oldTlog.endsWithCommit()) {
-return;
-}
+if (omitCommitted && oldTlog.endsWithCommit()) return;
} catch (IOException e) {
log.warn("Exception reading log", e);
return;

View File

@ -30,7 +30,6 @@ import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;
-import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.JSONTestUtil;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
@ -66,7 +65,6 @@ import org.slf4j.LoggerFactory;
@Slow
@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776")
-@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-11293")
public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -222,6 +220,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 1);
// Now introduce a network partition between the leader and 1 replica, so a minRf of 2 is still achieved
+log.info("partitioning replica : " + notLeaders.get(0));
SocketProxy proxy0 = getProxyForReplica(notLeaders.get(0));
proxy0.close();
@ -241,6 +240,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
Replica partitionedReplica = slice.getReplica(notLeaders.get(0).getName());
assertEquals("The partitioned replica did not get marked down",
Replica.State.DOWN.toString(), partitionedReplica.getStr(ZkStateReader.STATE_PROP));
+log.info("un-partitioning replica : " + notLeaders.get(0));
proxy0.reopen();