From 8db8ab3be23b029b5b32e06f2682a905d4e707b2 Mon Sep 17 00:00:00 2001 From: Ishan Chattopadhyaya Date: Thu, 12 Dec 2019 08:58:34 +0530 Subject: [PATCH] SOLR-13945: SPLITSHARD can cause data loss due to rollback when final commit fails --- solr/CHANGES.txt | 3 ++ .../cloud/api/collections/SplitShardCmd.java | 32 +++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1fdc2c87b14..499e8f959e8 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -250,6 +250,9 @@ Bug Fixes * SOLR-13953: Prometheus exporter in SolrCloud mode limited to 100 nodes (Alex Jablonski via Erick Erickson) +* SOLR-13945: Fix: SPLITSHARD can cause data loss on a failure to commit after the sub-shards are active and a rollback + is done to make parent shard active again (Ishan Chattopadhyaya, ab) + Other Changes --------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java index e08c6e530f1..333051a654e 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java @@ -542,6 +542,12 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd { // always gets a chance to execute. See SOLR-7673 if (repFactor == 1) { + // A commit is needed so that documents are visible when the sub-shard replicas come up + // (Note: This commit used to be after the state switch, but was brought here before the state switch + // as per SOLR-13945 so that sub shards don't come up empty, momentarily, after being marked active) + t = timings.sub("finalCommit"); + ocmh.commit(results, slice.get(), parentShardLeader); + t.stop(); // switch sub shard states to 'active' log.info("Replication factor is 1 so switching shard states"); Map propMap = new HashMap<>(); @@ -583,9 +589,14 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd { log.info("Successfully created all replica shards for all sub-slices " + subSlices); - t = timings.sub("finalCommit"); - ocmh.commit(results, slice.get(), parentShardLeader); - t.stop(); + // The final commit was added in SOLR-4997 so that documents are visible + // when the sub-shard replicas come up + if (repFactor > 1) { + t = timings.sub("finalCommit"); + ocmh.commit(results, slice.get(), parentShardLeader); + t.stop(); + } + if (withTiming) { results.add(CommonParams.TIMING, timings.asNamedList()); } @@ -675,6 +686,21 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd { return; } + // If parent is inactive and all sub shards are active, then rolling back + // to make the parent active again will cause data loss. + if (coll.getSlice(parentShard).getState() == Slice.State.INACTIVE) { + boolean allSubSlicesActive = true; + for (String sub: subSlices) { + if (coll.getSlice(sub).getState() != Slice.State.ACTIVE) { + allSubSlicesActive = false; + break; + } + } + if (allSubSlicesActive) { + return; + } + } + // set already created sub shards states to CONSTRUCTION - this prevents them // from entering into RECOVERY or ACTIVE (SOLR-9455) final Map propMap = new HashMap<>();