mirror of https://github.com/apache/lucene.git
SOLR-5437: DeleteReplicaTest fails constantly both locally and in jenkins
SOLR-5486: Cleanup and harden DeleteInactiveReplicaTest. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1544838 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2ef13e06f0
commit
b892c597f7
|
@ -287,35 +287,34 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
|
||||||
|
|
||||||
String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
|
String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
|
||||||
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
|
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
|
||||||
//assume the core exists and try to unload it
|
|
||||||
if (!Slice.ACTIVE.equals(replica.getStr(Slice.STATE))) {
|
|
||||||
deleteCoreNode(collectionName, replicaName, replica, core);
|
|
||||||
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
|
|
||||||
} else {
|
|
||||||
Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION,
|
|
||||||
CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core);
|
|
||||||
|
|
||||||
ShardRequest sreq = new ShardRequest();
|
// assume the core exists and try to unload it
|
||||||
sreq.purpose = 1;
|
Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION,
|
||||||
if (baseUrl.startsWith("http://")) baseUrl = baseUrl.substring(7);
|
CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core);
|
||||||
sreq.shards = new String[]{baseUrl};
|
|
||||||
sreq.actualShards = sreq.shards;
|
|
||||||
sreq.params = new ModifiableSolrParams(new MapSolrParams(m) );
|
|
||||||
try {
|
|
||||||
shardHandler.submit(sreq, baseUrl, sreq.params);
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.info("Exception trying to unload core "+sreq,e);
|
|
||||||
}
|
|
||||||
if (waitForCoreNodeGone(collectionName, shard, replicaName)) return;//check if the core unload removed the corenode zk entry
|
|
||||||
deleteCoreNode(collectionName, replicaName, replica, core); // this could be because the core is gone but not updated in ZK yet (race condition)
|
|
||||||
if(waitForCoreNodeGone(collectionName, shard, replicaName)) return;
|
|
||||||
|
|
||||||
|
ShardRequest sreq = new ShardRequest();
|
||||||
|
sreq.purpose = 1;
|
||||||
|
if (baseUrl.startsWith("http://")) baseUrl = baseUrl.substring(7);
|
||||||
|
sreq.shards = new String[] {baseUrl};
|
||||||
|
sreq.actualShards = sreq.shards;
|
||||||
|
sreq.params = new ModifiableSolrParams(new MapSolrParams(m));
|
||||||
|
try {
|
||||||
|
shardHandler.submit(sreq, baseUrl, sreq.params);
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Exception trying to unload core " + sreq, e);
|
||||||
}
|
}
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : "+collectionName+"/"+shard+"/"+replicaName);
|
|
||||||
|
collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? new NamedList() : results, false, null);
|
||||||
|
|
||||||
|
if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;//check if the core unload removed the corenode zk entry
|
||||||
|
deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate
|
||||||
|
if(waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return;
|
||||||
|
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : " + collectionName + "/" + shard+"/" + replicaName);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName) throws InterruptedException {
|
private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException {
|
||||||
long waitUntil = System.currentTimeMillis() + 30000;
|
long waitUntil = System.currentTimeMillis() + timeoutms;
|
||||||
boolean deleted = false;
|
boolean deleted = false;
|
||||||
while (System.currentTimeMillis() < waitUntil) {
|
while (System.currentTimeMillis() < waitUntil) {
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
|
|
|
@ -596,6 +596,15 @@ public class CoreAdminHandler extends RequestHandlerBase {
|
||||||
"No such core exists '" + cname + "'");
|
"No such core exists '" + cname + "'");
|
||||||
} else {
|
} else {
|
||||||
if (coreContainer.getZkController() != null) {
|
if (coreContainer.getZkController() != null) {
|
||||||
|
// we are unloading, cancel any ongoing recovery
|
||||||
|
// so there are no races to publish state
|
||||||
|
// we will try to cancel again later before close
|
||||||
|
if (core != null) {
|
||||||
|
if (coreContainer.getZkController() != null) {
|
||||||
|
core.getSolrCoreState().cancelRecovery();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
log.info("Unregistering core " + core.getName() + " from cloudstate.");
|
log.info("Unregistering core " + core.getName() + " from cloudstate.");
|
||||||
try {
|
try {
|
||||||
coreContainer.getZkController().unregister(cname,
|
coreContainer.getZkController().unregister(cname,
|
||||||
|
|
|
@ -55,18 +55,22 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
|
||||||
client.shutdown();
|
client.shutdown();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void deleteInactiveReplicaTest() throws Exception{
|
private void deleteInactiveReplicaTest() throws Exception {
|
||||||
String COLL_NAME = "delDeadColl";
|
String collectionName = "delDeadColl";
|
||||||
|
|
||||||
createColl(COLL_NAME, client);
|
createCollection(collectionName, client);
|
||||||
|
|
||||||
|
waitForRecoveriesToFinish(collectionName, false);
|
||||||
|
|
||||||
boolean stopped = false;
|
boolean stopped = false;
|
||||||
JettySolrRunner stoppedJetty = null;
|
JettySolrRunner stoppedJetty = null;
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
Replica replica1=null;
|
Replica replica1 = null;
|
||||||
Slice shard1 = null;
|
Slice shard1 = null;
|
||||||
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||||
for (JettySolrRunner jetty : jettys) sb.append(jetty.getBaseUrl()).append(",");
|
.getClusterState().getCollection(collectionName);
|
||||||
|
for (JettySolrRunner jetty : jettys)
|
||||||
|
sb.append(jetty.getBaseUrl()).append(",");
|
||||||
|
|
||||||
for (Slice slice : testcoll.getActiveSlices()) {
|
for (Slice slice : testcoll.getActiveSlices()) {
|
||||||
for (Replica replica : slice.getReplicas())
|
for (Replica replica : slice.getReplicas())
|
||||||
|
@ -77,7 +81,8 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (baseUrl.toString().startsWith(replica.getStr(ZkStateReader.BASE_URL_PROP))) {
|
if (baseUrl.toString().startsWith(
|
||||||
|
replica.getStr(ZkStateReader.BASE_URL_PROP))) {
|
||||||
stoppedJetty = jetty;
|
stoppedJetty = jetty;
|
||||||
ChaosMonkey.stop(jetty);
|
ChaosMonkey.stop(jetty);
|
||||||
replica1 = replica;
|
replica1 = replica;
|
||||||
|
@ -88,46 +93,47 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*final Slice shard1 = testcoll.getSlices().iterator().next();
|
/*
|
||||||
if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
|
* final Slice shard1 = testcoll.getSlices().iterator().next();
|
||||||
Replica replica1 = shard1.getReplicas().iterator().next();
|
* if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
|
||||||
JettySolrRunner stoppedJetty = null;
|
* Replica replica1 = shard1.getReplicas().iterator().next();
|
||||||
StringBuilder sb = new StringBuilder();
|
* JettySolrRunner stoppedJetty = null; StringBuilder sb = new
|
||||||
for (JettySolrRunner jetty : jettys) {
|
* StringBuilder(); for (JettySolrRunner jetty : jettys) {
|
||||||
sb.append(jetty.getBaseUrl()).append(",");
|
* sb.append(jetty.getBaseUrl()).append(","); if(
|
||||||
if( jetty.getBaseUrl().toString().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
|
* jetty.getBaseUrl().toString
|
||||||
stoppedJetty = jetty;
|
* ().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) {
|
||||||
ChaosMonkey.stop(jetty);
|
* stoppedJetty = jetty; ChaosMonkey.stop(jetty); stopped = true; break; } }
|
||||||
stopped = true;
|
*/
|
||||||
break;
|
if (!stopped) {
|
||||||
}
|
fail("Could not find jetty to stop in collection " + testcoll
|
||||||
}*/
|
+ " jettys: " + sb);
|
||||||
if(!stopped){
|
|
||||||
fail("Could not find jetty to stop in collection "+ testcoll + " jettys: "+sb);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
long endAt = System.currentTimeMillis()+3000;
|
long endAt = System.currentTimeMillis() + 3000;
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
while(System.currentTimeMillis() < endAt){
|
while (System.currentTimeMillis() < endAt) {
|
||||||
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||||
if(!"active".equals(testcoll.getSlice(shard1.getName()).getReplica(replica1.getName()).getStr(Slice.STATE)) ){
|
.getClusterState().getCollection(collectionName);
|
||||||
success=true;
|
if (!"active".equals(testcoll.getSlice(shard1.getName())
|
||||||
|
.getReplica(replica1.getName()).getStr(Slice.STATE))) {
|
||||||
|
success = true;
|
||||||
}
|
}
|
||||||
if(success) break;
|
if (success) break;
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
}
|
}
|
||||||
log.info("removed_replicas {}/{} ",shard1.getName(),replica1.getName());
|
log.info("removed_replicas {}/{} ", shard1.getName(), replica1.getName());
|
||||||
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
|
removeAndWaitForReplicaGone(collectionName, client, replica1,
|
||||||
|
shard1.getName());
|
||||||
|
|
||||||
ChaosMonkey.start(stoppedJetty);
|
ChaosMonkey.start(stoppedJetty);
|
||||||
log.info("restarted jetty");
|
log.info("restarted jetty");
|
||||||
|
|
||||||
|
Map m = makeMap("qt", "/admin/cores", "action", "status");
|
||||||
|
|
||||||
Map m = makeMap("qt","/admin/cores",
|
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url"))
|
||||||
"action", "status");
|
.request(new QueryRequest(new MapSolrParams(m)));
|
||||||
|
assertNull("The core is up and running again",
|
||||||
NamedList<Object> resp = new HttpSolrServer(replica1.getStr("base_url")).request(new QueryRequest(new MapSolrParams(m)));
|
((NamedList) resp.get("status")).get(replica1.getStr("core")));
|
||||||
assertNull( "The core is up and running again" , ((NamedList)resp.get("status")).get(replica1.getStr("core")));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,18 +27,15 @@ import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.solr.client.solrj.SolrRequest;
|
import org.apache.solr.client.solrj.SolrRequest;
|
||||||
import org.apache.solr.client.solrj.SolrServerException;
|
import org.apache.solr.client.solrj.SolrServerException;
|
||||||
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
||||||
import org.apache.solr.client.solrj.impl.HttpSolrServer;
|
|
||||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||||
import org.apache.solr.common.cloud.DocCollection;
|
import org.apache.solr.common.cloud.DocCollection;
|
||||||
import org.apache.solr.common.cloud.Replica;
|
import org.apache.solr.common.cloud.Replica;
|
||||||
import org.apache.solr.common.cloud.Slice;
|
import org.apache.solr.common.cloud.Slice;
|
||||||
import org.apache.solr.common.params.MapSolrParams;
|
import org.apache.solr.common.params.MapSolrParams;
|
||||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
|
||||||
import org.apache.solr.common.params.SolrParams;
|
import org.apache.solr.common.params.SolrParams;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
@ -80,70 +77,63 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
||||||
checkCreatedVsState = false;
|
checkCreatedVsState = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void setDistributedParams(ModifiableSolrParams params) {
|
|
||||||
|
|
||||||
if (r.nextBoolean()) {
|
|
||||||
// don't set shards, let that be figured out from the cloud state
|
|
||||||
} else {
|
|
||||||
// use shard ids rather than physical locations
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
for (int i = 0; i < shardCount; i++) {
|
|
||||||
if (i > 0)
|
|
||||||
sb.append(',');
|
|
||||||
sb.append("shard" + (i + 3));
|
|
||||||
}
|
|
||||||
params.set("shards", sb.toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void doTest() throws Exception {
|
public void doTest() throws Exception {
|
||||||
deleteLiveReplicaTest();
|
deleteLiveReplicaTest();
|
||||||
// deleteInactiveReplicaTest();
|
|
||||||
// super.printLayout();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void deleteLiveReplicaTest() throws Exception{
|
private void deleteLiveReplicaTest() throws Exception {
|
||||||
String COLL_NAME = "delLiveColl";
|
String collectionName = "delLiveColl";
|
||||||
CloudSolrServer client = createCloudClient(null);
|
CloudSolrServer client = createCloudClient(null);
|
||||||
createColl(COLL_NAME, client);
|
try {
|
||||||
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
createCollection(collectionName, client);
|
||||||
|
|
||||||
Slice shard1 = null;
|
waitForRecoveriesToFinish(collectionName, false);
|
||||||
Replica replica1 = null;
|
|
||||||
for (Slice slice : testcoll.getSlices()) {
|
DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||||
if("active".equals( slice.getStr("state"))){
|
.getClusterState().getCollection(collectionName);
|
||||||
shard1 = slice;
|
|
||||||
for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
|
Slice shard1 = null;
|
||||||
|
Replica replica1 = null;
|
||||||
|
for (Slice slice : testcoll.getSlices()) {
|
||||||
|
if ("active".equals(slice.getStr("state"))) {
|
||||||
|
shard1 = slice;
|
||||||
|
for (Replica replica : shard1.getReplicas())
|
||||||
|
if ("active".equals(replica.getStr("state"))) replica1 = replica;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// final Slice shard1 = testcoll.getSlices().iterator().next();
|
||||||
|
// if(!shard1.getState().equals(Slice.ACTIVE))
|
||||||
|
// fail("shard is not active");
|
||||||
|
// for (Replica replica : shard1.getReplicas())
|
||||||
|
// if("active".equals(replica.getStr("state"))) replica1 =replica;
|
||||||
|
if (replica1 == null) fail("no active replicas found");
|
||||||
|
removeAndWaitForReplicaGone(collectionName, client, replica1,
|
||||||
|
shard1.getName());
|
||||||
|
} finally {
|
||||||
|
client.shutdown();
|
||||||
}
|
}
|
||||||
// final Slice shard1 = testcoll.getSlices().iterator().next();
|
|
||||||
// if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active");
|
|
||||||
// for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica;
|
|
||||||
if(replica1 == null) fail("no active replicas found");
|
|
||||||
Thread.sleep(2500);// Remove this later. Not sure whether the cluster state is failing to propagate and that is why the tests are failing. See SOLR-5437.
|
|
||||||
removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName());
|
|
||||||
client.shutdown();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void removeAndWaitForReplicaGone(String COLL_NAME, CloudSolrServer client, Replica replica, String shard) throws SolrServerException, IOException, InterruptedException {
|
protected void removeAndWaitForReplicaGone(String COLL_NAME,
|
||||||
Map m = makeMap("collection", COLL_NAME,
|
CloudSolrServer client, Replica replica, String shard)
|
||||||
"action", DELETEREPLICA,
|
throws SolrServerException, IOException, InterruptedException {
|
||||||
"shard",shard,
|
Map m = makeMap("collection", COLL_NAME, "action", DELETEREPLICA, "shard",
|
||||||
"replica",replica.getName());
|
shard, "replica", replica.getName());
|
||||||
SolrParams params = new MapSolrParams( m);
|
SolrParams params = new MapSolrParams(m);
|
||||||
SolrRequest request = new QueryRequest(params);
|
SolrRequest request = new QueryRequest(params);
|
||||||
request.setPath("/admin/collections");
|
request.setPath("/admin/collections");
|
||||||
client.request(request);
|
client.request(request);
|
||||||
long endAt = System.currentTimeMillis()+3000;
|
long endAt = System.currentTimeMillis() + 3000;
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
DocCollection testcoll = null;
|
DocCollection testcoll = null;
|
||||||
while(System.currentTimeMillis() < endAt){
|
while (System.currentTimeMillis() < endAt) {
|
||||||
testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME);
|
testcoll = getCommonCloudSolrServer().getZkStateReader()
|
||||||
|
.getClusterState().getCollection(COLL_NAME);
|
||||||
success = testcoll.getSlice(shard).getReplica(replica.getName()) == null;
|
success = testcoll.getSlice(shard).getReplica(replica.getName()) == null;
|
||||||
if(success) {
|
if (success) {
|
||||||
log.info("replica cleaned up {}/{} core {}",shard+"/"+replica.getName(), replica.getStr("core"));
|
log.info("replica cleaned up {}/{} core {}",
|
||||||
|
shard + "/" + replica.getName(), replica.getStr("core"));
|
||||||
log.info("current state {}", testcoll);
|
log.info("current state {}", testcoll);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -152,7 +142,7 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
||||||
assertTrue("Replica not cleaned up", success);
|
assertTrue("Replica not cleaned up", success);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void createColl(String COLL_NAME, CloudSolrServer client) throws Exception {
|
protected void createCollection(String COLL_NAME, CloudSolrServer client) throws Exception {
|
||||||
int replicationFactor = 2;
|
int replicationFactor = 2;
|
||||||
int numShards = 2;
|
int numShards = 2;
|
||||||
int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer()
|
int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer()
|
||||||
|
@ -164,15 +154,5 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase {
|
||||||
NUM_SLICES, numShards);
|
NUM_SLICES, numShards);
|
||||||
Map<String,List<Integer>> collectionInfos = new HashMap<String,List<Integer>>();
|
Map<String,List<Integer>> collectionInfos = new HashMap<String,List<Integer>>();
|
||||||
createCollection(collectionInfos, COLL_NAME, props, client);
|
createCollection(collectionInfos, COLL_NAME, props, client);
|
||||||
Set<Map.Entry<String,List<Integer>>> collectionInfosEntrySet = collectionInfos.entrySet();
|
|
||||||
for (Map.Entry<String,List<Integer>> entry : collectionInfosEntrySet) {
|
|
||||||
String collection = entry.getKey();
|
|
||||||
List<Integer> list = entry.getValue();
|
|
||||||
checkForCollection(collection, list, null);
|
|
||||||
String url = getUrlFromZk(getCommonCloudSolrServer().getZkStateReader().getClusterState(), collection);
|
|
||||||
HttpSolrServer collectionClient = new HttpSolrServer(url);
|
|
||||||
// poll for a second - it can take a moment before we are ready to serve
|
|
||||||
waitForNon403or404or503(collectionClient);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue