diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java index 93b2f73eaf8..ebfaca3fcb5 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java +++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java @@ -321,7 +321,14 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase { try (SolrCore core = cc.getCore(coreName)) { CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor(); String coll = cloudDesc.getCollectionName(); - String shardId = cloudDesc.getShardId(); + String shardId = cloudDesc.getShardId(); + + if (coll == null || shardId == null) { + log.error("Cannot start leader-initiated recovery on new leader (core="+ + coreName+") because collection and/or shard is null!"); + return; + } + String znodePath = zkController.getLeaderInitiatedRecoveryZnodePath(coll, shardId); List replicas = null; try { diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index b0945824943..5a7981e9dfc 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -1737,7 +1737,15 @@ public final class ZkController { public boolean ensureReplicaInLeaderInitiatedRecovery(final String collection, final String shardId, final String replicaUrl, final ZkCoreNodeProps replicaCoreProps, boolean forcePublishState) throws KeeperException, InterruptedException - { + { + if (collection == null) + throw new IllegalArgumentException("collection parameter cannot be null for starting leader-initiated recovery for replica: "+replicaUrl); + + if (shardId == null) + throw new IllegalArgumentException("shard parameter cannot be null for starting leader-initiated recovery for replica: "+replicaUrl); + + if (replicaUrl == null) + throw new IllegalArgumentException("replicaUrl parameter cannot be null for starting leader-initiated recovery"); // First, determine if this replica is already in recovery handling // which is needed because there can be many concurrent errors flooding in diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index fcc93c37935..61e51e7f8d1 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -1032,8 +1032,9 @@ public class CoreAdminHandler extends RequestHandlerBase { boolean onlyIfActiveCheckResult = onlyIfLeaderActive != null && onlyIfLeaderActive && (localState == null || !localState.equals(ZkStateReader.ACTIVE)); log.info("In WaitForState("+waitForState+"): collection="+collection+", shard="+slice.getName()+ + ", thisCore="+core.getName()+", leaderDoesNotNeedRecovery="+leaderDoesNotNeedRecovery+ ", isLeader? "+core.getCoreDescriptor().getCloudDescriptor().isLeader()+ - ", live="+live+", currentState="+state+", localState="+localState+", nodeName="+nodeName+ + ", live="+live+", checkLive="+checkLive+", currentState="+state+", localState="+localState+", nodeName="+nodeName+ ", coreNodeName="+coreNodeName+", onlyIfActiveCheckResult="+onlyIfActiveCheckResult+", nodeProps: "+nodeProps); if (!onlyIfActiveCheckResult && nodeProps != null && (state.equals(waitForState) || leaderDoesNotNeedRecovery)) { diff --git a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java index efb9cbbf810..78d2e7b7c30 100644 --- a/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java @@ -26,7 +26,6 @@ import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.JSONTestUtil; @@ -61,6 +60,8 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { // recognizes (and propagates) partitions private static final long sleepMsBeforeHealPartition = 1000L; + private static final int maxWaitSecsToSeeAllActive = 30; + private Map proxies = new HashMap(); public HttpPartitionTest() { @@ -156,7 +157,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { sendDoc(1); Replica notLeader = - ensureAllReplicasAreActive(testCollectionName, 2, 10).get(0); + ensureAllReplicasAreActive(testCollectionName, 2, maxWaitSecsToSeeAllActive).get(0); // ok, now introduce a network partition between the leader and the replica SocketProxy proxy = getProxyForReplica(notLeader); @@ -178,7 +179,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { proxy.reopen(); List notLeaders = - ensureAllReplicasAreActive(testCollectionName, 2, 20); // shouldn't take 20 secs but just to be safe + ensureAllReplicasAreActive(testCollectionName, 2, maxWaitSecsToSeeAllActive); sendDoc(3); @@ -210,7 +211,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { proxy.reopen(); } - notLeaders = ensureAllReplicasAreActive(testCollectionName, 2, 20); + notLeaders = ensureAllReplicasAreActive(testCollectionName, 2, maxWaitSecsToSeeAllActive); // verify all docs received assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, numDocs + 3); @@ -225,7 +226,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { sendDoc(1); List notLeaders = - ensureAllReplicasAreActive(testCollectionName, 3, 10); + ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive); assertTrue("Expected 2 replicas for collection " + testCollectionName + " but found " + notLeaders.size() + "; clusterState: " + cloudClient.getZkStateReader().getClusterState(), @@ -255,7 +256,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { proxy1.reopen(); // sent 4 docs in so far, verify they are on the leader and replica - notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, 20); + notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive); sendDoc(4); @@ -273,7 +274,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { sendDoc(1); List notLeaders = - ensureAllReplicasAreActive(testCollectionName, 3, 10); + ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive); assertTrue("Expected 2 replicas for collection " + testCollectionName + " but found " + notLeaders.size() + "; clusterState: " + cloudClient.getZkStateReader().getClusterState(), @@ -304,7 +305,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { proxy1.reopen(); // sent 4 docs in so far, verify they are on the leader and replica - notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, 20); + notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive); sendDoc(4); @@ -317,7 +318,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { JettySolrRunner leaderJetty = getJettyOnPort(getReplicaPort(leader)); // since maxShardsPerNode is 1, we're safe to kill the leader - notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, 20); + notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive); proxy0 = getProxyForReplica(notLeaders.get(0)); proxy0.close(); @@ -345,9 +346,9 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase { Thread.sleep(sleepMsBeforeHealPartition); Replica newLeader = - cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1", 30000); + cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1", 60000); - assertNotNull("No new leader was elected after 30 seconds", newLeader); + assertNotNull("No new leader was elected after 60 seconds", newLeader); assertTrue("Expected node "+shouldNotBeNewLeaderNode+ " to NOT be the new leader b/c it was out-of-sync with the old leader! ClusterState: "+