SOLR-5495: Raise the amount of time the test waits for replicas to become active after partitions are healed (to address intermittent Jenkins failures)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1596103 13f79535-47bb-0310-9956-ffa450edef68
Timothy Potter 2014-05-20 02:07:26 +00:00
parent 195317786c
commit b1035b1a9e
4 changed files with 31 additions and 14 deletions
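The substance of the change is in HttpPartitionTest: ad-hoc 10- and 20-second waits are replaced by the shared maxWaitSecsToSeeAllActive constant (30 seconds), and the wait for a new leader is doubled from 30 to 60 seconds. The sketch below illustrates the poll-until-true pattern that a helper such as ensureAllReplicasAreActive is built on; the class and method names are illustrative only and are not part of AbstractFullDistribZkTestBase.

import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;

// Minimal sketch of a bounded wait loop; illustrative names, not the Solr test API.
public final class WaitForActiveSketch {

  // Mirrors the constant this commit adds to HttpPartitionTest.
  static final int MAX_WAIT_SECS_TO_SEE_ALL_ACTIVE = 30;

  // Re-evaluates the condition (e.g. "every replica reports ACTIVE in the
  // cluster state") every 500 ms until it holds or the timeout expires.
  static void waitUntilTrue(Callable<Boolean> condition, int maxWaitSecs) throws Exception {
    long deadlineNanos = System.nanoTime() + TimeUnit.SECONDS.toNanos(maxWaitSecs);
    while (System.nanoTime() < deadlineNanos) {
      if (Boolean.TRUE.equals(condition.call())) {
        return;
      }
      Thread.sleep(500L);
    }
    throw new AssertionError("Condition not met within " + maxWaitSecs + " seconds");
  }
}

Raising maxWaitSecs only lengthens the worst case: a healthy cluster still exits the loop as soon as the condition holds, so a larger ceiling is cheap insurance against slow Jenkins machines.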


@@ -321,7 +321,14 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
try (SolrCore core = cc.getCore(coreName)) {
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
String coll = cloudDesc.getCollectionName();
String shardId = cloudDesc.getShardId();
+ if (coll == null || shardId == null) {
+   log.error("Cannot start leader-initiated recovery on new leader (core="+
+       coreName+") because collection and/or shard is null!");
+   return;
+ }
String znodePath = zkController.getLeaderInitiatedRecoveryZnodePath(coll, shardId);
List<String> replicas = null;
try {


@@ -1737,7 +1737,15 @@ public final class ZkController {
public boolean ensureReplicaInLeaderInitiatedRecovery(final String collection,
final String shardId, final String replicaUrl, final ZkCoreNodeProps replicaCoreProps, boolean forcePublishState)
throws KeeperException, InterruptedException
{
+ if (collection == null)
+   throw new IllegalArgumentException("collection parameter cannot be null for starting leader-initiated recovery for replica: "+replicaUrl);
+ if (shardId == null)
+   throw new IllegalArgumentException("shard parameter cannot be null for starting leader-initiated recovery for replica: "+replicaUrl);
+ if (replicaUrl == null)
+   throw new IllegalArgumentException("replicaUrl parameter cannot be null for starting leader-initiated recovery");
// First, determine if this replica is already in recovery handling
// which is needed because there can be many concurrent errors flooding in
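The two comment lines above refer to checking whether the replica is already flagged for recovery before publishing its state again, since many concurrent request errors can each try to trigger leader-initiated recovery for the same replica. A minimal sketch of that check-before-create pattern, written against the plain ZooKeeper client rather than Solr's ZkController (the znode path and method name are assumptions for illustration):

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

// Illustrative only; not the ZkController implementation.
public final class LirMarkerSketch {

  // Creates a recovery marker znode unless one already exists, so a flood of
  // concurrent errors for the same replica yields a single marker.
  static boolean markReplicaDownOnce(ZooKeeper zk, String znodePath, byte[] state)
      throws KeeperException, InterruptedException {
    if (zk.exists(znodePath, false) != null) {
      return false; // already flagged for leader-initiated recovery
    }
    try {
      zk.create(znodePath, state, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      return true;
    } catch (KeeperException.NodeExistsException e) {
      return false; // another thread created the marker first
    }
  }
}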


@@ -1032,8 +1032,9 @@ public class CoreAdminHandler extends RequestHandlerBase {
boolean onlyIfActiveCheckResult = onlyIfLeaderActive != null && onlyIfLeaderActive && (localState == null || !localState.equals(ZkStateReader.ACTIVE));
log.info("In WaitForState("+waitForState+"): collection="+collection+", shard="+slice.getName()+
", thisCore="+core.getName()+", leaderDoesNotNeedRecovery="+leaderDoesNotNeedRecovery+
", isLeader? "+core.getCoreDescriptor().getCloudDescriptor().isLeader()+
", live="+live+", currentState="+state+", localState="+localState+", nodeName="+nodeName+
", live="+live+", checkLive="+checkLive+", currentState="+state+", localState="+localState+", nodeName="+nodeName+
", coreNodeName="+coreNodeName+", onlyIfActiveCheckResult="+onlyIfActiveCheckResult+", nodeProps: "+nodeProps);
if (!onlyIfActiveCheckResult && nodeProps != null && (state.equals(waitForState) || leaderDoesNotNeedRecovery)) {


@@ -26,7 +26,6 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
- import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.JSONTestUtil;
@@ -61,6 +60,8 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
// recognizes (and propagates) partitions
private static final long sleepMsBeforeHealPartition = 1000L;
+ private static final int maxWaitSecsToSeeAllActive = 30;
private Map<URI,SocketProxy> proxies = new HashMap<URI,SocketProxy>();
public HttpPartitionTest() {
@@ -156,7 +157,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
sendDoc(1);
Replica notLeader =
-   ensureAllReplicasAreActive(testCollectionName, 2, 10).get(0);
+   ensureAllReplicasAreActive(testCollectionName, 2, maxWaitSecsToSeeAllActive).get(0);
// ok, now introduce a network partition between the leader and the replica
SocketProxy proxy = getProxyForReplica(notLeader);
@@ -178,7 +179,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
proxy.reopen();
List<Replica> notLeaders =
-   ensureAllReplicasAreActive(testCollectionName, 2, 20); // shouldn't take 20 secs but just to be safe
+   ensureAllReplicasAreActive(testCollectionName, 2, maxWaitSecsToSeeAllActive);
sendDoc(3);
@@ -210,7 +211,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
proxy.reopen();
}
- notLeaders = ensureAllReplicasAreActive(testCollectionName, 2, 20);
+ notLeaders = ensureAllReplicasAreActive(testCollectionName, 2, maxWaitSecsToSeeAllActive);
// verify all docs received
assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, numDocs + 3);
@@ -225,7 +226,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
sendDoc(1);
List<Replica> notLeaders =
-   ensureAllReplicasAreActive(testCollectionName, 3, 10);
+   ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive);
assertTrue("Expected 2 replicas for collection " + testCollectionName
+ " but found " + notLeaders.size() + "; clusterState: "
+ cloudClient.getZkStateReader().getClusterState(),
@@ -255,7 +256,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
proxy1.reopen();
// sent 4 docs in so far, verify they are on the leader and replica
- notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, 20);
+ notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive);
sendDoc(4);
@@ -273,7 +274,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
sendDoc(1);
List<Replica> notLeaders =
-   ensureAllReplicasAreActive(testCollectionName, 3, 10);
+   ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive);
assertTrue("Expected 2 replicas for collection " + testCollectionName
+ " but found " + notLeaders.size() + "; clusterState: "
+ cloudClient.getZkStateReader().getClusterState(),
@@ -304,7 +305,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
proxy1.reopen();
// sent 4 docs in so far, verify they are on the leader and replica
- notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, 20);
+ notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive);
sendDoc(4);
@@ -317,7 +318,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
JettySolrRunner leaderJetty = getJettyOnPort(getReplicaPort(leader));
// since maxShardsPerNode is 1, we're safe to kill the leader
- notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, 20);
+ notLeaders = ensureAllReplicasAreActive(testCollectionName, 3, maxWaitSecsToSeeAllActive);
proxy0 = getProxyForReplica(notLeaders.get(0));
proxy0.close();
@@ -345,9 +346,9 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
Thread.sleep(sleepMsBeforeHealPartition);
Replica newLeader =
-   cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1", 30000);
+   cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1", 60000);
- assertNotNull("No new leader was elected after 30 seconds", newLeader);
+ assertNotNull("No new leader was elected after 60 seconds", newLeader);
assertTrue("Expected node "+shouldNotBeNewLeaderNode+
" to NOT be the new leader b/c it was out-of-sync with the old leader! ClusterState: "+