SOLR-5437: DeleteReplicaTest fails constantly both locally and in Jenkins

SOLR-5486: Cleanup and harden DeleteInactiveReplicaTest.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1544838 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2013-11-23 17:14:22 +00:00
parent 2ef13e06f0
commit b892c597f7
4 changed files with 127 additions and 133 deletions

View File

@ -287,35 +287,34 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP); String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP); String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
//assume the core exists and try to unload it
if (!Slice.ACTIVE.equals(replica.getStr(Slice.STATE))) { // assume the core exists and try to unload it
deleteCoreNode(collectionName, replicaName, replica, core); Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION,
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return; CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core);
} else {
Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION, ShardRequest sreq = new ShardRequest();
CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core); sreq.purpose = 1;
if (baseUrl.startsWith("http://")) baseUrl = baseUrl.substring(7);
ShardRequest sreq = new ShardRequest(); sreq.shards = new String[] {baseUrl};
sreq.purpose = 1; sreq.actualShards = sreq.shards;
if (baseUrl.startsWith("http://")) baseUrl = baseUrl.substring(7); sreq.params = new ModifiableSolrParams(new MapSolrParams(m));
sreq.shards = new String[]{baseUrl}; try {
sreq.actualShards = sreq.shards; shardHandler.submit(sreq, baseUrl, sreq.params);
sreq.params = new ModifiableSolrParams(new MapSolrParams(m) ); } catch (Exception e) {
try { log.warn("Exception trying to unload core " + sreq, e);
shardHandler.submit(sreq, baseUrl, sreq.params);
} catch (Exception e) {
log.info("Exception trying to unload core "+sreq,e);
}
if (waitForCoreNodeGone(collectionName, shard, replicaName)) return;//check if the core unload removed the corenode zk enry
deleteCoreNode(collectionName, replicaName, replica, core); // this could be because the core is gone but not updated in ZK yet (race condition)
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
} }
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : "+collectionName+"/"+shard+"/"+replicaName);
collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? new NamedList() : results, false, null);
if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;//check if the core unload removed the corenode zk enry
deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate
if(waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return;
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : " + collectionName + "/" + shard+"/" + replicaName);
} }
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName) throws InterruptedException { private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException {
long waitUntil = System.currentTimeMillis() + 30000; long waitUntil = System.currentTimeMillis() + timeoutms;
boolean deleted = false; boolean deleted = false;
while (System.currentTimeMillis() < waitUntil) { while (System.currentTimeMillis() < waitUntil) {
Thread.sleep(100); Thread.sleep(100);

View File

@ -596,6 +596,15 @@ public class CoreAdminHandler extends RequestHandlerBase {
"No such core exists '" + cname + "'"); "No such core exists '" + cname + "'");
} else { } else {
if (coreContainer.getZkController() != null) { if (coreContainer.getZkController() != null) {
// we are unloading, cancel any ongoing recovery
// so there are no races to publish state
// we will try to cancel again later before close
if (core != null) {
if (coreContainer.getZkController() != null) {
core.getSolrCoreState().cancelRecovery();
}
}
log.info("Unregistering core " + core.getName() + " from cloudstate."); log.info("Unregistering core " + core.getName() + " from cloudstate.");
try { try {
coreContainer.getZkController().unregister(cname, coreContainer.getZkController().unregister(cname,

View File

@ -55,19 +55,23 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
client.shutdown(); client.shutdown();
} }
private void deleteInactiveReplicaTest() throws Exception{ private void deleteInactiveReplicaTest() throws Exception {
String COLL_NAME = "delDeadColl"; String collectionName = "delDeadColl";
createColl(COLL_NAME, client); createCollection(collectionName, client);
waitForRecoveriesToFinish(collectionName, false);
boolean stopped = false; boolean stopped = false;
JettySolrRunner stoppedJetty = null; JettySolrRunner stoppedJetty = null;
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
Replica replica1=null; Replica replica1 = null;
Slice shard1 = null; Slice shard1 = null;
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
for (JettySolrRunner jetty : jettys) sb.append(jetty.getBaseUrl()).append(","); .getClusterState().getCollection(collectionName);
for (JettySolrRunner jetty : jettys)
sb.append(jetty.getBaseUrl()).append(",");
for (Slice slice : testcoll.getActiveSlices()) { for (Slice slice : testcoll.getActiveSlices()) {
for (Replica replica : slice.getReplicas()) for (Replica replica : slice.getReplicas())
for (JettySolrRunner jetty : jettys) { for (JettySolrRunner jetty : jettys) {
@ -77,7 +81,8 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
} catch (Exception e) { } catch (Exception e) {
continue; continue;
} }
if (baseUrl.toString().startsWith(replica.getStr(ZkStateReader.BASE_URL_PROP))) { if (baseUrl.toString().startsWith(
replica.getStr(ZkStateReader.BASE_URL_PROP))) {
stoppedJetty = jetty; stoppedJetty = jetty;
ChaosMonkey.stop(jetty); ChaosMonkey.stop(jetty);
replica1 = replica; replica1 = replica;
@ -87,47 +92,48 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
} }
} }
} }
/*final Slice shard1 = testcoll.getSlices().iterator().next(); /*
if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active"); * final Slice shard1 = testcoll.getSlices().iterator().next();
Replica replica1 = shard1.getReplicas().iterator().next(); * if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
JettySolrRunner stoppedJetty = null; * Replica replica1 = shard1.getReplicas().iterator().next();
StringBuilder sb = new StringBuilder(); * JettySolrRunner stoppedJetty = null; StringBuilder sb = new
for (JettySolrRunner jetty : jettys) { * StringBuilder(); for (JettySolrRunner jetty : jettys) {
sb.append(jetty.getBaseUrl()).append(","); * sb.append(jetty.getBaseUrl()).append(","); if(
if( jetty.getBaseUrl().toString().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) { * jetty.getBaseUrl().toString
stoppedJetty = jetty; * ().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
ChaosMonkey.stop(jetty); * stoppedJetty = jetty; ChaosMonkey.stop(jetty); stopped = true; break; } }
stopped = true; */
break; if (!stopped) {
} fail("Could not find jetty to stop in collection " + testcoll
}*/ + " jettys: " + sb);
if(!stopped){
fail("Could not find jetty to stop in collection "+ testcoll + " jettys: "+sb);
} }
long endAt = System.currentTimeMillis()+3000; long endAt = System.currentTimeMillis() + 3000;
boolean success = false; boolean success = false;
while(System.currentTimeMillis() < endAt){ while (System.currentTimeMillis() < endAt) {
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); testcoll = getCommonCloudSolrServer().getZkStateReader()
if(!"active".equals(testcoll.getSlice(shard1.getName()).getReplica(replica1.getName()).getStr(Slice.STATE)) ){ .getClusterState().getCollection(collectionName);
success=true; if (!"active".equals(testcoll.getSlice(shard1.getName())
.getReplica(replica1.getName()).getStr(Slice.STATE))) {
success = true;
} }
if(success) break; if (success) break;
Thread.sleep(100); Thread.sleep(100);
} }
log.info("removed_replicas {}/{} ",shard1.getName(),replica1.getName()); log.info("removed_replicas {}/{} ", shard1.getName(), replica1.getName());
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName()); removeAndWaitForReplicaGone(collectionName, client, replica1,
shard1.getName());
ChaosMonkey.start(stoppedJetty); ChaosMonkey.start(stoppedJetty);
log.info("restarted jetty"); log.info("restarted jetty");
Map m = makeMap("qt", "/admin/cores", "action", "status");
Map m = makeMap("qt","/admin/cores",
"action", "status"); NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url"))
.request(new QueryRequest(new MapSolrParams(m)));
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url")).request(new QueryRequest(new MapSolrParams(m))); assertNull("The core is up and running again",
assertNull( "The core is up and running again" , ((NamedList)resp.get("status")).get(replica1.getStr("core"))); ((NamedList) resp.get("status")).get(replica1.getStr("core")));
} }
} }

View File

@ -27,18 +27,15 @@ import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrServer; import org.apache.solr.client.solrj.impl.CloudSolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.params.MapSolrParams; import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
@ -80,70 +77,63 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
checkCreatedVsState = false; checkCreatedVsState = false;
} }
@Override
protected void setDistributedParams(ModifiableSolrParams params) {
if (r.nextBoolean()) {
// don't set shards, let that be figured out from the cloud state
} else {
// use shard ids rather than physical locations
StringBuilder sb = new StringBuilder();
for (int i = 0; i < shardCount; i++) {
if (i > 0)
sb.append(',');
sb.append("shard" + (i + 3));
}
params.set("shards", sb.toString());
}
}
@Override @Override
public void doTest() throws Exception { public void doTest() throws Exception {
deleteLiveReplicaTest(); deleteLiveReplicaTest();
// deleteInactiveReplicaTest();
// super.printLayout();
} }
private void deleteLiveReplicaTest() throws Exception{ private void deleteLiveReplicaTest() throws Exception {
String COLL_NAME = "delLiveColl"; String collectionName = "delLiveColl";
CloudSolrServer client = createCloudClient(null); CloudSolrServer client = createCloudClient(null);
createColl(COLL_NAME, client); try {
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); createCollection(collectionName, client);
Slice shard1 = null; waitForRecoveriesToFinish(collectionName, false);
Replica replica1 = null;
for (Slice slice : testcoll.getSlices()) { DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
if("active".equals( slice.getStr("state"))){ .getClusterState().getCollection(collectionName);
shard1 = slice;
for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica; Slice shard1 = null;
Replica replica1 = null;
for (Slice slice : testcoll.getSlices()) {
if ("active".equals(slice.getStr("state"))) {
shard1 = slice;
for (Replica replica : shard1.getReplicas())
if ("active".equals(replica.getStr("state"))) replica1 = replica;
}
} }
// final Slice shard1 = testcoll.getSlices().iterator().next();
// if(!shard1.getState().equals(Slice.ACTIVE))
// fail("shard is not active");
// for (Replica replica : shard1.getReplicas())
// if("active".equals(replica.getStr("state"))) replica1 =replica;
if (replica1 == null) fail("no active replicas found");
removeAndWaitForReplicaGone(collectionName, client, replica1,
shard1.getName());
} finally {
client.shutdown();
} }
// final Slice shard1 = testcoll.getSlices().iterator().next();
// if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
// for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
if(replica1 == null) fail("no active replicas found");
Thread.sleep(2500);//remove this later.not sure if the clusterstate is not propagated and that is why the tests are failing.SOLR-5437
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
client.shutdown();
} }
protected void removeAndWaitForReplicaGone(String COLL_NAME, CloudSolrServer client, Replica replica, String shard) throws SolrServerException, IOException, InterruptedException { protected void removeAndWaitForReplicaGone(String COLL_NAME,
Map m = makeMap("collection", COLL_NAME, CloudSolrServer client, Replica replica, String shard)
"action", DELETEREPLICA, throws SolrServerException, IOException, InterruptedException {
"shard",shard, Map m = makeMap("collection", COLL_NAME, "action", DELETEREPLICA, "shard",
"replica",replica.getName()); shard, "replica", replica.getName());
SolrParams params = new MapSolrParams( m); SolrParams params = new MapSolrParams(m);
SolrRequest request = new QueryRequest(params); SolrRequest request = new QueryRequest(params);
request.setPath("/admin/collections"); request.setPath("/admin/collections");
client.request(request); client.request(request);
long endAt = System.currentTimeMillis()+3000; long endAt = System.currentTimeMillis() + 3000;
boolean success = false; boolean success = false;
DocCollection testcoll = null; DocCollection testcoll = null;
while(System.currentTimeMillis() < endAt){ while (System.currentTimeMillis() < endAt) {
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); testcoll = getCommonCloudSolrServer().getZkStateReader()
.getClusterState().getCollection(COLL_NAME);
success = testcoll.getSlice(shard).getReplica(replica.getName()) == null; success = testcoll.getSlice(shard).getReplica(replica.getName()) == null;
if(success) { if (success) {
log.info("replica cleaned up {}/{} core {}",shard+"/"+replica.getName(), replica.getStr("core")); log.info("replica cleaned up {}/{} core {}",
shard + "/" + replica.getName(), replica.getStr("core"));
log.info("current state {}", testcoll); log.info("current state {}", testcoll);
break; break;
} }
@ -152,7 +142,7 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
assertTrue("Replica not cleaned up", success); assertTrue("Replica not cleaned up", success);
} }
protected void createColl(String COLL_NAME, CloudSolrServer client) throws Exception { protected void createCollection(String COLL_NAME, CloudSolrServer client) throws Exception {
int replicationFactor = 2; int replicationFactor = 2;
int numShards = 2; int numShards = 2;
int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer() int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer()
@ -164,15 +154,5 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
NUM_SLICES, numShards); NUM_SLICES, numShards);
Map<String,List<Integer>> collectionInfos = new HashMap<String,List<Integer>>(); Map<String,List<Integer>> collectionInfos = new HashMap<String,List<Integer>>();
createCollection(collectionInfos, COLL_NAME, props, client); createCollection(collectionInfos, COLL_NAME, props, client);
Set<Map.Entry<String,List<Integer>>> collectionInfosEntrySet = collectionInfos.entrySet();
for (Map.Entry<String,List<Integer>> entry : collectionInfosEntrySet) {
String collection = entry.getKey();
List<Integer> list = entry.getValue();
checkForCollection(collection, list, null);
String url = getUrlFromZk(getCommonCloudSolrServer().getZkStateReader().getClusterState(), collection);
HttpSolrServer collectionClient = new HttpSolrServer(url);
// poll for a second - it can take a moment before we are ready to serve
waitForNon403or404or503(collectionClient);
}
} }
} }