mirror of https://github.com/apache/lucene.git
SOLR-5437: DeleteReplicaTest fails constantly both locally and in Jenkins
SOLR-5486: Cleanup and harden DeleteInactiveReplicaTest. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1544838 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2ef13e06f0
commit
b892c597f7
|
@ -287,11 +287,8 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
|
|||
|
||||
String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
|
||||
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
|
||||
|
||||
// assume the core exists and try to unload it
|
||||
if (!Slice.ACTIVE.equals(replica.getStr(Slice.STATE))) {
|
||||
deleteCoreNode(collectionName, replicaName, replica, core);
|
||||
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
|
||||
} else {
|
||||
Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION,
|
||||
CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core);
|
||||
|
||||
|
@ -304,18 +301,20 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
|
|||
try {
|
||||
shardHandler.submit(sreq, baseUrl, sreq.params);
|
||||
} catch (Exception e) {
|
||||
log.info("Exception trying to unload core "+sreq,e);
|
||||
log.warn("Exception trying to unload core " + sreq, e);
|
||||
}
|
||||
if (waitForCoreNodeGone(collectionName, shard, replicaName)) return;//check if the core unload removed the corenode zk entry
|
||||
deleteCoreNode(collectionName, replicaName, replica, core); // this could be because the core is gone but not updated in ZK yet (race condition)
|
||||
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
|
||||
|
||||
}
|
||||
collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? new NamedList() : results, false, null);
|
||||
|
||||
if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;//check if the core unload removed the corenode zk entry
|
||||
deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate
|
||||
if(waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return;
|
||||
|
||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : " + collectionName + "/" + shard+"/" + replicaName);
|
||||
}
|
||||
|
||||
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName) throws InterruptedException {
|
||||
long waitUntil = System.currentTimeMillis() + 30000;
|
||||
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException {
|
||||
long waitUntil = System.currentTimeMillis() + timeoutms;
|
||||
boolean deleted = false;
|
||||
while (System.currentTimeMillis() < waitUntil) {
|
||||
Thread.sleep(100);
|
||||
|
|
|
@ -596,6 +596,15 @@ public class CoreAdminHandler extends RequestHandlerBase {
|
|||
"No such core exists '" + cname + "'");
|
||||
} else {
|
||||
if (coreContainer.getZkController() != null) {
|
||||
// we are unloading, cancel any ongoing recovery
|
||||
// so there are no races to publish state
|
||||
// we will try to cancel again later before close
|
||||
if (core != null) {
|
||||
if (coreContainer.getZkController() != null) {
|
||||
core.getSolrCoreState().cancelRecovery();
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Unregistering core " + core.getName() + " from cloudstate.");
|
||||
try {
|
||||
coreContainer.getZkController().unregister(cname,
|
||||
|
|
|
@ -56,17 +56,21 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
|
|||
}
|
||||
|
||||
private void deleteInactiveReplicaTest() throws Exception {
|
||||
String COLL_NAME = "delDeadColl";
|
||||
String collectionName = "delDeadColl";
|
||||
|
||||
createColl(COLL_NAME, client);
|
||||
createCollection(collectionName, client);
|
||||
|
||||
waitForRecoveriesToFinish(collectionName, false);
|
||||
|
||||
boolean stopped = false;
|
||||
JettySolrRunner stoppedJetty = null;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Replica replica1 = null;
|
||||
Slice shard1 = null;
|
||||
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
||||
for (JettySolrRunner jetty : jettys) sb.append(jetty.getBaseUrl()).append(",");
|
||||
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||
.getClusterState().getCollection(collectionName);
|
||||
for (JettySolrRunner jetty : jettys)
|
||||
sb.append(jetty.getBaseUrl()).append(",");
|
||||
|
||||
for (Slice slice : testcoll.getActiveSlices()) {
|
||||
for (Replica replica : slice.getReplicas())
|
||||
|
@ -77,7 +81,8 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
|
|||
} catch (Exception e) {
|
||||
continue;
|
||||
}
|
||||
if (baseUrl.toString().startsWith(replica.getStr(ZkStateReader.BASE_URL_PROP))) {
|
||||
if (baseUrl.toString().startsWith(
|
||||
replica.getStr(ZkStateReader.BASE_URL_PROP))) {
|
||||
stoppedJetty = jetty;
|
||||
ChaosMonkey.stop(jetty);
|
||||
replica1 = replica;
|
||||
|
@ -88,46 +93,47 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
|
|||
}
|
||||
}
|
||||
|
||||
/*final Slice shard1 = testcoll.getSlices().iterator().next();
|
||||
if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
|
||||
Replica replica1 = shard1.getReplicas().iterator().next();
|
||||
JettySolrRunner stoppedJetty = null;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (JettySolrRunner jetty : jettys) {
|
||||
sb.append(jetty.getBaseUrl()).append(",");
|
||||
if( jetty.getBaseUrl().toString().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
|
||||
stoppedJetty = jetty;
|
||||
ChaosMonkey.stop(jetty);
|
||||
stopped = true;
|
||||
break;
|
||||
}
|
||||
}*/
|
||||
/*
|
||||
* final Slice shard1 = testcoll.getSlices().iterator().next();
|
||||
* if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
|
||||
* Replica replica1 = shard1.getReplicas().iterator().next();
|
||||
* JettySolrRunner stoppedJetty = null; StringBuilder sb = new
|
||||
* StringBuilder(); for (JettySolrRunner jetty : jettys) {
|
||||
* sb.append(jetty.getBaseUrl()).append(","); if(
|
||||
* jetty.getBaseUrl().toString
|
||||
* ().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
|
||||
* stoppedJetty = jetty; ChaosMonkey.stop(jetty); stopped = true; break; } }
|
||||
*/
|
||||
if (!stopped) {
|
||||
fail("Could not find jetty to stop in collection "+ testcoll + " jettys: "+sb);
|
||||
fail("Could not find jetty to stop in collection " + testcoll
|
||||
+ " jettys: " + sb);
|
||||
}
|
||||
|
||||
long endAt = System.currentTimeMillis() + 3000;
|
||||
boolean success = false;
|
||||
while (System.currentTimeMillis() < endAt) {
|
||||
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
||||
if(!"active".equals(testcoll.getSlice(shard1.getName()).getReplica(replica1.getName()).getStr(Slice.STATE)) ){
|
||||
testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||
.getClusterState().getCollection(collectionName);
|
||||
if (!"active".equals(testcoll.getSlice(shard1.getName())
|
||||
.getReplica(replica1.getName()).getStr(Slice.STATE))) {
|
||||
success = true;
|
||||
}
|
||||
if (success) break;
|
||||
Thread.sleep(100);
|
||||
}
|
||||
log.info("removed_replicas {}/{} ", shard1.getName(), replica1.getName());
|
||||
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
|
||||
removeAndWaitForReplicaGone(collectionName, client, replica1,
|
||||
shard1.getName());
|
||||
|
||||
ChaosMonkey.start(stoppedJetty);
|
||||
log.info("restarted jetty");
|
||||
|
||||
Map m = makeMap("qt", "/admin/cores", "action", "status");
|
||||
|
||||
Map m = makeMap("qt","/admin/cores",
|
||||
"action", "status");
|
||||
|
||||
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url")).request(new QueryRequest(new MapSolrParams(m)));
|
||||
assertNull( "The core is up and running again" , ((NamedList)resp.get("status")).get(replica1.getStr("core")));
|
||||
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url"))
|
||||
.request(new QueryRequest(new MapSolrParams(m)));
|
||||
assertNull("The core is up and running again",
|
||||
((NamedList) resp.get("status")).get(replica1.getStr("core")));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,18 +27,15 @@ import java.io.IOException;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.solr.client.solrj.SolrRequest;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrServer;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.common.cloud.DocCollection;
|
||||
import org.apache.solr.common.cloud.Replica;
|
||||
import org.apache.solr.common.cloud.Slice;
|
||||
import org.apache.solr.common.params.MapSolrParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
@ -80,58 +77,49 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
|||
checkCreatedVsState = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void setDistributedParams(ModifiableSolrParams params) {
|
||||
|
||||
if (r.nextBoolean()) {
|
||||
// don't set shards, let that be figured out from the cloud state
|
||||
} else {
|
||||
// use shard ids rather than physical locations
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < shardCount; i++) {
|
||||
if (i > 0)
|
||||
sb.append(',');
|
||||
sb.append("shard" + (i + 3));
|
||||
}
|
||||
params.set("shards", sb.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doTest() throws Exception {
|
||||
deleteLiveReplicaTest();
|
||||
// deleteInactiveReplicaTest();
|
||||
// super.printLayout();
|
||||
}
|
||||
|
||||
private void deleteLiveReplicaTest() throws Exception {
|
||||
String COLL_NAME = "delLiveColl";
|
||||
String collectionName = "delLiveColl";
|
||||
CloudSolrServer client = createCloudClient(null);
|
||||
createColl(COLL_NAME, client);
|
||||
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
||||
try {
|
||||
createCollection(collectionName, client);
|
||||
|
||||
waitForRecoveriesToFinish(collectionName, false);
|
||||
|
||||
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||
.getClusterState().getCollection(collectionName);
|
||||
|
||||
Slice shard1 = null;
|
||||
Replica replica1 = null;
|
||||
for (Slice slice : testcoll.getSlices()) {
|
||||
if ("active".equals(slice.getStr("state"))) {
|
||||
shard1 = slice;
|
||||
for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
|
||||
for (Replica replica : shard1.getReplicas())
|
||||
if ("active".equals(replica.getStr("state"))) replica1 = replica;
|
||||
}
|
||||
}
|
||||
// final Slice shard1 = testcoll.getSlices().iterator().next();
|
||||
// if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
|
||||
// for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
|
||||
// if(!shard1.getState().equals(Slice.ACTIVE))
|
||||
// fail("shard is not active");
|
||||
// for (Replica replica : shard1.getReplicas())
|
||||
// if("active".equals(replica.getStr("state"))) replica1 =replica;
|
||||
if (replica1 == null) fail("no active replicas found");
|
||||
Thread.sleep(2500);// Remove this later. Not sure whether the clusterstate is not being propagated and that is why the tests are failing; see SOLR-5437.
|
||||
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
|
||||
removeAndWaitForReplicaGone(collectionName, client, replica1,
|
||||
shard1.getName());
|
||||
} finally {
|
||||
client.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
protected void removeAndWaitForReplicaGone(String COLL_NAME, CloudSolrServer client, Replica replica, String shard) throws SolrServerException, IOException, InterruptedException {
|
||||
Map m = makeMap("collection", COLL_NAME,
|
||||
"action", DELETEREPLICA,
|
||||
"shard",shard,
|
||||
"replica",replica.getName());
|
||||
protected void removeAndWaitForReplicaGone(String COLL_NAME,
|
||||
CloudSolrServer client, Replica replica, String shard)
|
||||
throws SolrServerException, IOException, InterruptedException {
|
||||
Map m = makeMap("collection", COLL_NAME, "action", DELETEREPLICA, "shard",
|
||||
shard, "replica", replica.getName());
|
||||
SolrParams params = new MapSolrParams(m);
|
||||
SolrRequest request = new QueryRequest(params);
|
||||
request.setPath("/admin/collections");
|
||||
|
@ -140,10 +128,12 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
|||
boolean success = false;
|
||||
DocCollection testcoll = null;
|
||||
while (System.currentTimeMillis() < endAt) {
|
||||
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
||||
testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||
.getClusterState().getCollection(COLL_NAME);
|
||||
success = testcoll.getSlice(shard).getReplica(replica.getName()) == null;
|
||||
if (success) {
|
||||
log.info("replica cleaned up {}/{} core {}",shard+"/"+replica.getName(), replica.getStr("core"));
|
||||
log.info("replica cleaned up {}/{} core {}",
|
||||
shard + "/" + replica.getName(), replica.getStr("core"));
|
||||
log.info("current state {}", testcoll);
|
||||
break;
|
||||
}
|
||||
|
@ -152,7 +142,7 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
|||
assertTrue("Replica not cleaned up", success);
|
||||
}
|
||||
|
||||
protected void createColl(String COLL_NAME, CloudSolrServer client) throws Exception {
|
||||
protected void createCollection(String COLL_NAME, CloudSolrServer client) throws Exception {
|
||||
int replicationFactor = 2;
|
||||
int numShards = 2;
|
||||
int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer()
|
||||
|
@ -164,15 +154,5 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
|||
NUM_SLICES, numShards);
|
||||
Map<String,List<Integer>> collectionInfos = new HashMap<String,List<Integer>>();
|
||||
createCollection(collectionInfos, COLL_NAME, props, client);
|
||||
Set<Map.Entry<String,List<Integer>>> collectionInfosEntrySet = collectionInfos.entrySet();
|
||||
for (Map.Entry<String,List<Integer>> entry : collectionInfosEntrySet) {
|
||||
String collection = entry.getKey();
|
||||
List<Integer> list = entry.getValue();
|
||||
checkForCollection(collection, list, null);
|
||||
String url = getUrlFromZk(getCommonCloudSolrServer().getZkStateReader().getClusterState(), collection);
|
||||
HttpSolrServer collectionClient = new HttpSolrServer(url);
|
||||
// poll for a second - it can take a moment before we are ready to serve
|
||||
waitForNon403or404or503(collectionClient);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue