SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails

This commit is contained in:
Ishan Chattopadhyaya 2019-12-12 08:58:34 +05:30
parent 861c77cb17
commit 8db8ab3be2
2 changed files with 32 additions and 3 deletions

View File

@ -250,6 +250,9 @@ Bug Fixes
* SOLR-13953: Prometheus exporter in SolrCloud mode limited to 100 nodes (Alex Jablonski via Erick Erickson)
* SOLR-13945: Fix: SPLITSHARD can cause data loss when the final commit fails after the sub-shards are active and a rollback
is then done to make the parent shard active again (Ishan Chattopadhyaya, ab)
Other Changes
---------------------

View File

@ -542,6 +542,12 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
// always gets a chance to execute. See SOLR-7673
if (repFactor == 1) {
// A commit is needed so that documents are visible when the sub-shard replicas come up
// (Note: This commit used to happen after the state switch, but SOLR-13945 moved it before the state switch
// so that sub-shards don't momentarily come up empty after being marked active)
t = timings.sub("finalCommit");
ocmh.commit(results, slice.get(), parentShardLeader);
t.stop();
// switch sub shard states to 'active'
log.info("Replication factor is 1 so switching shard states");
Map<String, Object> propMap = new HashMap<>();
@ -583,9 +589,14 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
log.info("Successfully created all replica shards for all sub-slices " + subSlices);
// The final commit was added in SOLR-4997 so that documents are visible
// when the sub-shard replicas come up
if (repFactor > 1) {
t = timings.sub("finalCommit");
ocmh.commit(results, slice.get(), parentShardLeader);
t.stop();
}
if (withTiming) {
results.add(CommonParams.TIMING, timings.asNamedList());
}
@ -675,6 +686,21 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
return;
}
// If parent is inactive and all sub shards are active, then rolling back
// to make the parent active again will cause data loss.
if (coll.getSlice(parentShard).getState() == Slice.State.INACTIVE) {
boolean allSubSlicesActive = true;
for (String sub: subSlices) {
if (coll.getSlice(sub).getState() != Slice.State.ACTIVE) {
allSubSlicesActive = false;
break;
}
}
if (allSubSlicesActive) {
return;
}
}
// set already created sub shards states to CONSTRUCTION - this prevents them
// from entering into RECOVERY or ACTIVE (SOLR-9455)
final Map<String, Object> propMap = new HashMap<>();