SOLR-5437: DeleteReplicaTest fails constantly both locally and in jenkins

SOLR-5486: Cleanup and harden DeleteInactiveReplicaTest.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1544838 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2013-11-23 17:14:22 +00:00
parent 2ef13e06f0
commit b892c597f7
4 changed files with 127 additions and 133 deletions

View File

@ -287,11 +287,8 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
// assume the core exists and try to unload it
if (!Slice.ACTIVE.equals(replica.getStr(Slice.STATE))) {
deleteCoreNode(collectionName, replicaName, replica, core);
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
} else {
Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION,
CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core);
@ -304,18 +301,20 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
try {
shardHandler.submit(sreq, baseUrl, sreq.params);
} catch (Exception e) {
log.info("Exception trying to unload core "+sreq,e);
log.warn("Exception trying to unload core " + sreq, e);
}
if (waitForCoreNodeGone(collectionName, shard, replicaName)) return;//check if the core unload removed the corenode zk enry
deleteCoreNode(collectionName, replicaName, replica, core); // this could be because the core is gone but not updated in ZK yet (race condition)
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
}
collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? new NamedList() : results, false, null);
if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;//check if the core unload removed the corenode zk enry
deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate
if(waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return;
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : " + collectionName + "/" + shard+"/" + replicaName);
}
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName) throws InterruptedException {
long waitUntil = System.currentTimeMillis() + 30000;
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException {
long waitUntil = System.currentTimeMillis() + timeoutms;
boolean deleted = false;
while (System.currentTimeMillis() < waitUntil) {
Thread.sleep(100);

View File

@ -596,6 +596,15 @@ public class CoreAdminHandler extends RequestHandlerBase {
"No such core exists '" + cname + "'");
} else {
if (coreContainer.getZkController() != null) {
// we are unloading, cancel any ongoing recovery
// so there are no races to publish state
// we will try to cancel again later before close
if (core != null) {
if (coreContainer.getZkController() != null) {
core.getSolrCoreState().cancelRecovery();
}
}
log.info("Unregistering core " + core.getName() + " from cloudstate.");
try {
coreContainer.getZkController().unregister(cname,

View File

@ -56,17 +56,21 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
}
private void deleteInactiveReplicaTest() throws Exception {
String COLL_NAME = "delDeadColl";
String collectionName = "delDeadColl";
createColl(COLL_NAME, client);
createCollection(collectionName, client);
waitForRecoveriesToFinish(collectionName, false);
boolean stopped = false;
JettySolrRunner stoppedJetty = null;
StringBuilder sb = new StringBuilder();
Replica replica1 = null;
Slice shard1 = null;
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
for (JettySolrRunner jetty : jettys) sb.append(jetty.getBaseUrl()).append(",");
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
.getClusterState().getCollection(collectionName);
for (JettySolrRunner jetty : jettys)
sb.append(jetty.getBaseUrl()).append(",");
for (Slice slice : testcoll.getActiveSlices()) {
for (Replica replica : slice.getReplicas())
@ -77,7 +81,8 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
} catch (Exception e) {
continue;
}
if (baseUrl.toString().startsWith(replica.getStr(ZkStateReader.BASE_URL_PROP))) {
if (baseUrl.toString().startsWith(
replica.getStr(ZkStateReader.BASE_URL_PROP))) {
stoppedJetty = jetty;
ChaosMonkey.stop(jetty);
replica1 = replica;
@ -88,46 +93,47 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
}
}
/*final Slice shard1 = testcoll.getSlices().iterator().next();
if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
Replica replica1 = shard1.getReplicas().iterator().next();
JettySolrRunner stoppedJetty = null;
StringBuilder sb = new StringBuilder();
for (JettySolrRunner jetty : jettys) {
sb.append(jetty.getBaseUrl()).append(",");
if( jetty.getBaseUrl().toString().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
stoppedJetty = jetty;
ChaosMonkey.stop(jetty);
stopped = true;
break;
}
}*/
/*
* final Slice shard1 = testcoll.getSlices().iterator().next();
* if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
* Replica replica1 = shard1.getReplicas().iterator().next();
* JettySolrRunner stoppedJetty = null; StringBuilder sb = new
* StringBuilder(); for (JettySolrRunner jetty : jettys) {
* sb.append(jetty.getBaseUrl()).append(","); if(
* jetty.getBaseUrl().toString
* ().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
* stoppedJetty = jetty; ChaosMonkey.stop(jetty); stopped = true; break; } }
*/
if (!stopped) {
fail("Could not find jetty to stop in collection "+ testcoll + " jettys: "+sb);
fail("Could not find jetty to stop in collection " + testcoll
+ " jettys: " + sb);
}
long endAt = System.currentTimeMillis() + 3000;
boolean success = false;
while (System.currentTimeMillis() < endAt) {
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
if(!"active".equals(testcoll.getSlice(shard1.getName()).getReplica(replica1.getName()).getStr(Slice.STATE)) ){
testcoll = getCommonCloudSolrServer().getZkStateReader()
.getClusterState().getCollection(collectionName);
if (!"active".equals(testcoll.getSlice(shard1.getName())
.getReplica(replica1.getName()).getStr(Slice.STATE))) {
success = true;
}
if (success) break;
Thread.sleep(100);
}
log.info("removed_replicas {}/{} ", shard1.getName(), replica1.getName());
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
removeAndWaitForReplicaGone(collectionName, client, replica1,
shard1.getName());
ChaosMonkey.start(stoppedJetty);
log.info("restarted jetty");
Map m = makeMap("qt", "/admin/cores", "action", "status");
Map m = makeMap("qt","/admin/cores",
"action", "status");
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url")).request(new QueryRequest(new MapSolrParams(m)));
assertNull( "The core is up and running again" , ((NamedList)resp.get("status")).get(replica1.getStr("core")));
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url"))
.request(new QueryRequest(new MapSolrParams(m)));
assertNull("The core is up and running again",
((NamedList) resp.get("status")).get(replica1.getStr("core")));
}
}

View File

@ -27,18 +27,15 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.junit.After;
import org.junit.Before;
@ -80,58 +77,49 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
checkCreatedVsState = false;
}
@Override
protected void setDistributedParams(ModifiableSolrParams params) {
if (r.nextBoolean()) {
// don't set shards, let that be figured out from the cloud state
} else {
// use shard ids rather than physical locations
StringBuilder sb = new StringBuilder();
for (int i = 0; i < shardCount; i++) {
if (i > 0)
sb.append(',');
sb.append("shard" + (i + 3));
}
params.set("shards", sb.toString());
}
}
@Override
public void doTest() throws Exception {
deleteLiveReplicaTest();
// deleteInactiveReplicaTest();
// super.printLayout();
}
private void deleteLiveReplicaTest() throws Exception {
String COLL_NAME = "delLiveColl";
String collectionName = "delLiveColl";
CloudSolrServer client = createCloudClient(null);
createColl(COLL_NAME, client);
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
try {
createCollection(collectionName, client);
waitForRecoveriesToFinish(collectionName, false);
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
.getClusterState().getCollection(collectionName);
Slice shard1 = null;
Replica replica1 = null;
for (Slice slice : testcoll.getSlices()) {
if ("active".equals(slice.getStr("state"))) {
shard1 = slice;
for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
for (Replica replica : shard1.getReplicas())
if ("active".equals(replica.getStr("state"))) replica1 = replica;
}
}
// final Slice shard1 = testcoll.getSlices().iterator().next();
// if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
// for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
// if(!shard1.getState().equals(Slice.ACTIVE))
// fail("shard is not active");
// for (Replica replica : shard1.getReplicas())
// if("active".equals(replica.getStr("state"))) replica1 =replica;
if (replica1 == null) fail("no active replicas found");
Thread.sleep(2500);//remove this later.not sure if the clusterstate is not propagated and that is why the tests are failing.SOLR-5437
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
removeAndWaitForReplicaGone(collectionName, client, replica1,
shard1.getName());
} finally {
client.shutdown();
}
}
protected void removeAndWaitForReplicaGone(String COLL_NAME, CloudSolrServer client, Replica replica, String shard) throws SolrServerException, IOException, InterruptedException {
Map m = makeMap("collection", COLL_NAME,
"action", DELETEREPLICA,
"shard",shard,
"replica",replica.getName());
protected void removeAndWaitForReplicaGone(String COLL_NAME,
CloudSolrServer client, Replica replica, String shard)
throws SolrServerException, IOException, InterruptedException {
Map m = makeMap("collection", COLL_NAME, "action", DELETEREPLICA, "shard",
shard, "replica", replica.getName());
SolrParams params = new MapSolrParams(m);
SolrRequest request = new QueryRequest(params);
request.setPath("/admin/collections");
@ -140,10 +128,12 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
boolean success = false;
DocCollection testcoll = null;
while (System.currentTimeMillis() < endAt) {
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
testcoll = getCommonCloudSolrServer().getZkStateReader()
.getClusterState().getCollection(COLL_NAME);
success = testcoll.getSlice(shard).getReplica(replica.getName()) == null;
if (success) {
log.info("replica cleaned up {}/{} core {}",shard+"/"+replica.getName(), replica.getStr("core"));
log.info("replica cleaned up {}/{} core {}",
shard + "/" + replica.getName(), replica.getStr("core"));
log.info("current state {}", testcoll);
break;
}
@ -152,7 +142,7 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
assertTrue("Replica not cleaned up", success);
}
protected void createColl(String COLL_NAME, CloudSolrServer client) throws Exception {
protected void createCollection(String COLL_NAME, CloudSolrServer client) throws Exception {
int replicationFactor = 2;
int numShards = 2;
int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer()
@ -164,15 +154,5 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
NUM_SLICES, numShards);
Map<String,List<Integer>> collectionInfos = new HashMap<String,List<Integer>>();
createCollection(collectionInfos, COLL_NAME, props, client);
Set<Map.Entry<String,List<Integer>>> collectionInfosEntrySet = collectionInfos.entrySet();
for (Map.Entry<String,List<Integer>> entry : collectionInfosEntrySet) {
String collection = entry.getKey();
List<Integer> list = entry.getValue();
checkForCollection(collection, list, null);
String url = getUrlFromZk(getCommonCloudSolrServer().getZkStateReader().getClusterState(), collection);
HttpSolrServer collectionClient = new HttpSolrServer(url);
// poll for a second - it can take a moment before we are ready to serve
waitForNon403or404or503(collectionClient);
}
}
}