SOLR-6944: ReplicationFactorTest and HttpPartitionTest both fail with org.apache.http.NoHttpResponseException: The target server failed to respond

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1656056 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2015-01-30 16:56:24 +00:00
parent 669e9cf617
commit fd35bd5ae4
2 changed files with 88 additions and 22 deletions

View File

@ -17,14 +17,17 @@ package org.apache.solr.cloud;
* limitations under the License.
*/
import org.apache.http.NoHttpResponseException;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.JSONTestUtil;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.ClusterState;
@ -41,6 +44,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
@ -123,7 +127,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
protected void testLeaderInitiatedRecoveryCRUD() throws Exception {
String testCollectionName = "c8n_crud_1x2";
String shardId = "shard1";
createCollection(testCollectionName, 1, 2, 1);
createCollectionRetry(testCollectionName, 1, 2, 1);
cloudClient.setDefaultCollection(testCollectionName);
Replica leader =
@ -172,7 +176,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
protected void testRf2() throws Exception {
// create a collection that has 1 shard but 2 replicas
String testCollectionName = "c8n_1x2";
createCollection(testCollectionName, 1, 2, 1);
createCollectionRetry(testCollectionName, 1, 2, 1);
cloudClient.setDefaultCollection(testCollectionName);
sendDoc(1);
@ -253,11 +257,12 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
protected void testRf3() throws Exception {
// create a collection that has 1 shard but 3 replicas
String testCollectionName = "c8n_1x3";
createCollection(testCollectionName, 1, 3, 1);
createCollectionRetry(testCollectionName, 1, 3, 1);
cloudClient.setDefaultCollection(testCollectionName);
sendDoc(1);
List<Replica> notLeaders =
ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
assertTrue("Expected 2 replicas for collection " + testCollectionName
@ -306,11 +311,27 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
}
}
/**
 * Creates a collection and, if the admin response contains a "failure" entry,
 * deletes the collection by name and attempts the creation exactly one more time.
 * The test is failed if the second attempt also reports a "failure" entry.
 *
 * @param testCollectionName name of the collection to create
 * @param numShards number of shards passed through to {@code createCollection}
 * @param replicationFactor replication factor passed through to {@code createCollection}
 * @param maxShardsPerNode max shards per node passed through to {@code createCollection}
 * @throws SolrServerException if one of the admin requests throws it
 * @throws IOException if one of the admin requests throws it
 */
private void createCollectionRetry(String testCollectionName, int numShards, int replicationFactor, int maxShardsPerNode)
    throws SolrServerException, IOException {
  CollectionAdminResponse firstAttempt =
      createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
  if (firstAttempt.getResponse().get("failure") == null) {
    // created cleanly on the first try, nothing to retry
    return;
  }

  // first attempt reported a failure: delete the collection by name before trying again
  CollectionAdminRequest.Delete deleteRequest = new CollectionAdminRequest.Delete();
  deleteRequest.setCollectionName(testCollectionName);
  deleteRequest.process(cloudClient);

  CollectionAdminResponse secondAttempt =
      createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
  boolean stillFailing = secondAttempt.getResponse().get("failure") != null;
  if (stillFailing) {
    fail("Could not create " + testCollectionName);
  }
}
// test inspired by SOLR-6511
protected void testLeaderZkSessionLoss() throws Exception {
String testCollectionName = "c8n_1x2_leader_session_loss";
createCollection(testCollectionName, 1, 2, 1);
createCollectionRetry(testCollectionName, 1, 2, 1);
cloudClient.setDefaultCollection(testCollectionName);
sendDoc(1);
@ -329,7 +350,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
testCollectionName+"; clusterState: "+printClusterStateInfo(testCollectionName), leader);
JettySolrRunner leaderJetty = getJettyOnPort(getReplicaPort(leader));
HttpSolrClient leaderSolr = getHttpSolrClient(leader, testCollectionName);
SolrInputDocument doc = new SolrInputDocument();
doc.addField(id, String.valueOf(2));
doc.addField("a_t", "hello" + 2);
@ -360,7 +381,8 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
// TODO: This test logic seems to be timing dependent and fails on Jenkins
// need to come up with a better approach
log.info("Sending doc 2 to old leader "+leader.getName());
try {
try ( HttpSolrClient leaderSolr = getHttpSolrClient(leader, testCollectionName)) {
leaderSolr.add(doc);
leaderSolr.close();
@ -374,7 +396,7 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
try (HttpSolrClient client = getHttpSolrClient(currentLeader, testCollectionName)) {
client.add(doc); // this should work
}
}
}
List<Replica> participatingReplicas = getActiveOrRecoveringReplicas(testCollectionName, "shard1");
Set<String> replicasToCheck = new HashSet<>();
@ -452,17 +474,37 @@ public class HttpPartitionTest extends AbstractFullDistribZkTestBase {
return new HttpSolrClient(url);
}
protected void sendDoc(int docId) throws Exception {
protected void doSendDoc(int docid) throws Exception {
UpdateRequest up = new UpdateRequest();
up.setParam(UpdateRequest.MIN_REPFACT, String.valueOf(2));
SolrInputDocument doc = new SolrInputDocument();
doc.addField(id, String.valueOf(docId));
doc.addField("a_t", "hello" + docId);
doc.addField(id, String.valueOf(docid));
doc.addField("a_t", "hello" + docid);
up.add(doc);
int minAchievedRf =
cloudClient.getMinAchievedReplicationFactor(cloudClient.getDefaultCollection(), cloudClient.request(up));
}
/**
 * Sends the document with the given id via {@code doSendDoc}, retrying up to two
 * more times when the request fails with a {@link SolrServerException} whose root
 * cause is a {@link NoHttpResponseException}.
 *
 * <p>The pause is 100 ms before the first retry and 3000 ms before the second.
 * The third attempt is not guarded: whatever it throws propagates to the caller.
 * A {@link SolrServerException} with any other root cause is rethrown immediately
 * rather than being swallowed, so a genuine indexing failure is not hidden from
 * the test.
 *
 * @param docId id of the document to send
 * @throws Exception if the send fails for a non-retryable reason, if all attempts
 *         fail, or if the thread is interrupted while pausing between attempts
 */
protected void sendDoc(int docId) throws Exception {
  // pause (in ms) applied before retry #1 and retry #2 respectively
  final long[] retryPausesMs = {100L, 3000L};

  for (long pauseMs : retryPausesMs) {
    try {
      doSendDoc(docId);
      return;
    } catch (SolrServerException e) {
      if (!(e.getRootCause() instanceof NoHttpResponseException)) {
        // not the transient "server failed to respond" case: surface it
        throw e;
      }
      // we don't know if the doc was accepted or not, we send again
      Thread.sleep(pauseMs);
    }
  }

  // last attempt: any failure here is reported to the caller
  doSendDoc(docId);
}
/**
* Query the real-time get handler for a specific doc by ID to verify it
* exists in the provided server, using distrib=false so it doesn't route to another replica.

View File

@ -17,11 +17,19 @@ package org.apache.solr.cloud;
* limitations under the License.
*/
import java.io.File;
import java.net.ServerSocket;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
@ -30,12 +38,6 @@ import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.net.ServerSocket;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
//@AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/SOLR-6157")
/**
@ -128,7 +130,20 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
String shardId = "shard1";
int minRf = 2;
createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
CollectionAdminResponse resp = createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
if (resp.getResponse().get("failure") != null) {
CollectionAdminRequest.Delete req = new CollectionAdminRequest.Delete();
req.setCollectionName(testCollectionName);
req.process(cloudClient);
resp = createCollection(testCollectionName, numShards, replicationFactor, maxShardsPerNode);
if (resp.getResponse().get("failure") != null) {
fail("Could not create " + testCollectionName);
}
}
cloudClient.setDefaultCollection(testCollectionName);
List<Replica> replicas =
@ -149,8 +164,8 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
up.add(batch);
Replica leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, shardId);
sendNonDirectUpdateRequestReplica(leader, up, 2, testCollectionName);
sendNonDirectUpdateRequestReplica(replicas.get(0), up, 2, testCollectionName);
sendNonDirectUpdateRequestReplicaWithRetry(leader, up, 2, testCollectionName);
sendNonDirectUpdateRequestReplicaWithRetry(replicas.get(0), up, 2, testCollectionName);
// so now kill the replica of shard2 and verify the achieved rf is only 1
List<Replica> shard2Replicas =
@ -162,8 +177,8 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
Thread.sleep(2000);
// shard1 will have rf=2 but shard2 will only have rf=1
sendNonDirectUpdateRequestReplica(leader, up, 1, testCollectionName);
sendNonDirectUpdateRequestReplica(replicas.get(0), up, 1, testCollectionName);
sendNonDirectUpdateRequestReplicaWithRetry(leader, up, 1, testCollectionName);
sendNonDirectUpdateRequestReplicaWithRetry(replicas.get(0), up, 1, testCollectionName);
// heal the partition
getProxyForReplica(shard2Replicas.get(0)).reopen();
@ -171,6 +186,15 @@ public class ReplicationFactorTest extends AbstractFullDistribZkTestBase {
Thread.sleep(2000);
}
/**
 * Calls {@code sendNonDirectUpdateRequestReplica} and, if that call throws any
 * {@link Exception}, calls it one more time with the same arguments. The first
 * exception is discarded; an exception from the second call propagates.
 *
 * @param replica replica the update request is sent to
 * @param up update request to send
 * @param expectedRf replication factor passed through to the underlying send
 * @param collection collection name passed through to the underlying send
 * @throws Exception if the second attempt fails
 */
protected void sendNonDirectUpdateRequestReplicaWithRetry(Replica replica, UpdateRequest up, int expectedRf, String collection) throws Exception {
  boolean needsRetry;
  try {
    sendNonDirectUpdateRequestReplica(replica, up, expectedRf, collection);
    needsRetry = false;
  } catch (Exception ignored) {
    // first failure is deliberately dropped; a single retry follows
    needsRetry = true;
  }

  if (needsRetry) {
    sendNonDirectUpdateRequestReplica(replica, up, expectedRf, collection);
  }
}
@SuppressWarnings("rawtypes")
protected void sendNonDirectUpdateRequestReplica(Replica replica, UpdateRequest up, int expectedRf, String collection) throws Exception {