From b892c597f7264d1a14e791946813e44aee41d705 Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Sat, 23 Nov 2013 17:14:22 +0000 Subject: [PATCH] SOLR-5437: DeleteReplicaTest fails constantly both locally and in jenkins SOLR-5486: Cleanup and harden DeleteInactiveReplicaTest. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1544838 13f79535-47bb-0310-9956-ffa450edef68 --- .../cloud/OverseerCollectionProcessor.java | 51 +++++---- .../solr/handler/admin/CoreAdminHandler.java | 9 ++ .../solr/cloud/DeleteInactiveReplicaTest.java | 96 ++++++++-------- .../apache/solr/cloud/DeleteReplicaTest.java | 104 +++++++----------- 4 files changed, 127 insertions(+), 133 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java index 64e34f4ec96..b0ff86de6ef 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java @@ -287,35 +287,34 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread { String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP); String core = replica.getStr(ZkStateReader.CORE_NAME_PROP); - //assume the core exists and try to unload it - if (!Slice.ACTIVE.equals(replica.getStr(Slice.STATE))) { - deleteCoreNode(collectionName, replicaName, replica, core); - if(waitForCoreNodeGone(collectionName, shard, replicaName)) return; - } else { - Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION, - CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core); - - ShardRequest sreq = new ShardRequest(); - sreq.purpose = 1; - if (baseUrl.startsWith("http://")) baseUrl = baseUrl.substring(7); - sreq.shards = new String[]{baseUrl}; - sreq.actualShards = sreq.shards; - sreq.params = new ModifiableSolrParams(new MapSolrParams(m) ); - try { - shardHandler.submit(sreq, baseUrl, sreq.params); - } 
catch (Exception e) { - log.info("Exception trying to unload core "+sreq,e); - } - if (waitForCoreNodeGone(collectionName, shard, replicaName)) return;//check if the core unload removed the corenode zk enry - deleteCoreNode(collectionName, replicaName, replica, core); // this could be because the core is gone but not updated in ZK yet (race condition) - if(waitForCoreNodeGone(collectionName, shard, replicaName)) return; - + + // assume the core exists and try to unload it + Map m = ZkNodeProps.makeMap("qt", adminPath, CoreAdminParams.ACTION, + CoreAdminAction.UNLOAD.toString(), CoreAdminParams.CORE, core); + + ShardRequest sreq = new ShardRequest(); + sreq.purpose = 1; + if (baseUrl.startsWith("http://")) baseUrl = baseUrl.substring(7); + sreq.shards = new String[] {baseUrl}; + sreq.actualShards = sreq.shards; + sreq.params = new ModifiableSolrParams(new MapSolrParams(m)); + try { + shardHandler.submit(sreq, baseUrl, sreq.params); + } catch (Exception e) { + log.warn("Exception trying to unload core " + sreq, e); } - throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : "+collectionName+"/"+shard+"/"+replicaName); + + collectShardResponses(!Slice.ACTIVE.equals(replica.getStr(Slice.STATE)) ? 
new NamedList() : results, false, null); + + if (waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return;//check if the core unload removed the corenode zk entry + deleteCoreNode(collectionName, replicaName, replica, core); // try and ensure core info is removed from clusterstate + if(waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return; + + throw new SolrException(ErrorCode.SERVER_ERROR, "Could not remove replica : " + collectionName + "/" + shard+"/" + replicaName); } - private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName) throws InterruptedException { - long waitUntil = System.currentTimeMillis() + 30000; + private boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException { + long waitUntil = System.currentTimeMillis() + timeoutms; boolean deleted = false; while (System.currentTimeMillis() < waitUntil) { Thread.sleep(100); diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index 23c4c4794e6..1c5f165457e 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -596,6 +596,15 @@ public class CoreAdminHandler extends RequestHandlerBase { "No such core exists '" + cname + "'"); } else { if (coreContainer.getZkController() != null) { + // we are unloading, cancel any ongoing recovery + // so there are no races to publish state + // we will try to cancel again later before close + if (core != null) { + if (coreContainer.getZkController() != null) { + core.getSolrCoreState().cancelRecovery(); + } + } + log.info("Unregistering core " + core.getName() + " from cloudstate."); try { coreContainer.getZkController().unregister(cname, diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java 
b/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java index ffe89a1c84c..8a2c10db867 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DeleteInactiveReplicaTest.java @@ -55,19 +55,23 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{ client.shutdown(); } - private void deleteInactiveReplicaTest() throws Exception{ - String COLL_NAME = "delDeadColl"; - - createColl(COLL_NAME, client); - + private void deleteInactiveReplicaTest() throws Exception { + String collectionName = "delDeadColl"; + + createCollection(collectionName, client); + + waitForRecoveriesToFinish(collectionName, false); + boolean stopped = false; JettySolrRunner stoppedJetty = null; StringBuilder sb = new StringBuilder(); - Replica replica1=null; + Replica replica1 = null; Slice shard1 = null; - DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); - for (JettySolrRunner jetty : jettys) sb.append(jetty.getBaseUrl()).append(","); - + DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader() + .getClusterState().getCollection(collectionName); + for (JettySolrRunner jetty : jettys) + sb.append(jetty.getBaseUrl()).append(","); + for (Slice slice : testcoll.getActiveSlices()) { for (Replica replica : slice.getReplicas()) for (JettySolrRunner jetty : jettys) { @@ -77,7 +81,8 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{ } catch (Exception e) { continue; } - if (baseUrl.toString().startsWith(replica.getStr(ZkStateReader.BASE_URL_PROP))) { + if (baseUrl.toString().startsWith( + replica.getStr(ZkStateReader.BASE_URL_PROP))) { stoppedJetty = jetty; ChaosMonkey.stop(jetty); replica1 = replica; @@ -87,47 +92,48 @@ public class DeleteInactiveReplicaTest extends DeleteReplicaTest{ } } } - - /*final Slice shard1 = testcoll.getSlices().iterator().next(); - if(!shard1.getState().equals(Slice.ACTIVE)) 
fail("shard is not active"); - Replica replica1 = shard1.getReplicas().iterator().next(); - JettySolrRunner stoppedJetty = null; - StringBuilder sb = new StringBuilder(); - for (JettySolrRunner jetty : jettys) { - sb.append(jetty.getBaseUrl()).append(","); - if( jetty.getBaseUrl().toString().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) { - stoppedJetty = jetty; - ChaosMonkey.stop(jetty); - stopped = true; - break; - } - }*/ - if(!stopped){ - fail("Could not find jetty to stop in collection "+ testcoll + " jettys: "+sb); + + /* + * final Slice shard1 = testcoll.getSlices().iterator().next(); + * if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active"); + * Replica replica1 = shard1.getReplicas().iterator().next(); + * JettySolrRunner stoppedJetty = null; StringBuilder sb = new + * StringBuilder(); for (JettySolrRunner jetty : jettys) { + * sb.append(jetty.getBaseUrl()).append(","); if( + * jetty.getBaseUrl().toString + * ().startsWith(replica1.getStr(ZkStateReader.BASE_URL_PROP)) ) { + * stoppedJetty = jetty; ChaosMonkey.stop(jetty); stopped = true; break; } } + */ + if (!stopped) { + fail("Could not find jetty to stop in collection " + testcoll + + " jettys: " + sb); } - - long endAt = System.currentTimeMillis()+3000; + + long endAt = System.currentTimeMillis() + 3000; boolean success = false; - while(System.currentTimeMillis() < endAt){ - testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); - if(!"active".equals(testcoll.getSlice(shard1.getName()).getReplica(replica1.getName()).getStr(Slice.STATE)) ){ - success=true; + while (System.currentTimeMillis() < endAt) { + testcoll = getCommonCloudSolrServer().getZkStateReader() + .getClusterState().getCollection(collectionName); + if (!"active".equals(testcoll.getSlice(shard1.getName()) + .getReplica(replica1.getName()).getStr(Slice.STATE))) { + success = true; } - if(success) break; + if (success) break; Thread.sleep(100); } - 
log.info("removed_replicas {}/{} ",shard1.getName(),replica1.getName()); - removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName()); - + log.info("removed_replicas {}/{} ", shard1.getName(), replica1.getName()); + removeAndWaitForReplicaGone(collectionName, client, replica1, + shard1.getName()); + ChaosMonkey.start(stoppedJetty); log.info("restarted jetty"); - - - Map m = makeMap("qt","/admin/cores", - "action", "status"); - - NamedList resp = new HttpSolrServer(replica1.getStr("base_url")).request(new QueryRequest(new MapSolrParams(m))); - assertNull( "The core is up and running again" , ((NamedList)resp.get("status")).get(replica1.getStr("core"))); - + + Map m = makeMap("qt", "/admin/cores", "action", "status"); + + NamedList resp = new HttpSolrServer(replica1.getStr("base_url")) + .request(new QueryRequest(new MapSolrParams(m))); + assertNull("The core is up and running again", + ((NamedList) resp.get("status")).get(replica1.getStr("core"))); + } } diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java index e28f673eb02..c9f63a53212 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java @@ -27,18 +27,15 @@ import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Set; import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrServer; -import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.params.MapSolrParams; -import org.apache.solr.common.params.ModifiableSolrParams; import 
org.apache.solr.common.params.SolrParams; import org.junit.After; import org.junit.Before; @@ -80,70 +77,63 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase { checkCreatedVsState = false; } - @Override - protected void setDistributedParams(ModifiableSolrParams params) { - - if (r.nextBoolean()) { - // don't set shards, let that be figured out from the cloud state - } else { - // use shard ids rather than physical locations - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < shardCount; i++) { - if (i > 0) - sb.append(','); - sb.append("shard" + (i + 3)); - } - params.set("shards", sb.toString()); - } - } - @Override public void doTest() throws Exception { deleteLiveReplicaTest(); -// deleteInactiveReplicaTest(); -// super.printLayout(); } - private void deleteLiveReplicaTest() throws Exception{ - String COLL_NAME = "delLiveColl"; + private void deleteLiveReplicaTest() throws Exception { + String collectionName = "delLiveColl"; CloudSolrServer client = createCloudClient(null); - createColl(COLL_NAME, client); - DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); - - Slice shard1 = null; - Replica replica1 = null; - for (Slice slice : testcoll.getSlices()) { - if("active".equals( slice.getStr("state"))){ - shard1 = slice; - for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica; + try { + createCollection(collectionName, client); + + waitForRecoveriesToFinish(collectionName, false); + + DocCollection testcoll = getCommonCloudSolrServer().getZkStateReader() + .getClusterState().getCollection(collectionName); + + Slice shard1 = null; + Replica replica1 = null; + for (Slice slice : testcoll.getSlices()) { + if ("active".equals(slice.getStr("state"))) { + shard1 = slice; + for (Replica replica : shard1.getReplicas()) + if ("active".equals(replica.getStr("state"))) replica1 = replica; + } } + // final Slice shard1 = 
testcoll.getSlices().iterator().next(); + // if(!shard1.getState().equals(Slice.ACTIVE)) + // fail("shard is not active"); + // for (Replica replica : shard1.getReplicas()) + // if("active".equals(replica.getStr("state"))) replica1 =replica; + if (replica1 == null) fail("no active replicas found"); + removeAndWaitForReplicaGone(collectionName, client, replica1, + shard1.getName()); + } finally { + client.shutdown(); } -// final Slice shard1 = testcoll.getSlices().iterator().next(); -// if(!shard1.getState().equals(Slice.ACTIVE)) fail("shard is not active"); -// for (Replica replica : shard1.getReplicas()) if("active".equals(replica.getStr("state"))) replica1 =replica; - if(replica1 == null) fail("no active replicas found"); - Thread.sleep(2500);//remove this later.not sure if the clusterstate is not propagated and that is why the tests are failing.SOLR-5437 - removeAndWaitForReplicaGone(COLL_NAME, client, replica1, shard1.getName()); - client.shutdown(); } - protected void removeAndWaitForReplicaGone(String COLL_NAME, CloudSolrServer client, Replica replica, String shard) throws SolrServerException, IOException, InterruptedException { - Map m = makeMap("collection", COLL_NAME, - "action", DELETEREPLICA, - "shard",shard, - "replica",replica.getName()); - SolrParams params = new MapSolrParams( m); + protected void removeAndWaitForReplicaGone(String COLL_NAME, + CloudSolrServer client, Replica replica, String shard) + throws SolrServerException, IOException, InterruptedException { + Map m = makeMap("collection", COLL_NAME, "action", DELETEREPLICA, "shard", + shard, "replica", replica.getName()); + SolrParams params = new MapSolrParams(m); SolrRequest request = new QueryRequest(params); request.setPath("/admin/collections"); client.request(request); - long endAt = System.currentTimeMillis()+3000; + long endAt = System.currentTimeMillis() + 3000; boolean success = false; DocCollection testcoll = null; - while(System.currentTimeMillis() < endAt){ - testcoll = 
getCommonCloudSolrServer().getZkStateReader().getClusterState().getCollection(COLL_NAME); + while (System.currentTimeMillis() < endAt) { + testcoll = getCommonCloudSolrServer().getZkStateReader() + .getClusterState().getCollection(COLL_NAME); success = testcoll.getSlice(shard).getReplica(replica.getName()) == null; - if(success) { - log.info("replica cleaned up {}/{} core {}",shard+"/"+replica.getName(), replica.getStr("core")); + if (success) { + log.info("replica cleaned up {}/{} core {}", + shard + "/" + replica.getName(), replica.getStr("core")); log.info("current state {}", testcoll); break; } @@ -152,7 +142,7 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase { assertTrue("Replica not cleaned up", success); } - protected void createColl(String COLL_NAME, CloudSolrServer client) throws Exception { + protected void createCollection(String COLL_NAME, CloudSolrServer client) throws Exception { int replicationFactor = 2; int numShards = 2; int maxShardsPerNode = ((((numShards+1) * replicationFactor) / getCommonCloudSolrServer() @@ -164,15 +154,5 @@ public class DeleteReplicaTest extends AbstractFullDistribZkTestBase { NUM_SLICES, numShards); Map> collectionInfos = new HashMap>(); createCollection(collectionInfos, COLL_NAME, props, client); - Set>> collectionInfosEntrySet = collectionInfos.entrySet(); - for (Map.Entry> entry : collectionInfosEntrySet) { - String collection = entry.getKey(); - List list = entry.getValue(); - checkForCollection(collection, list, null); - String url = getUrlFromZk(getCommonCloudSolrServer().getZkStateReader().getClusterState(), collection); - HttpSolrServer collectionClient = new HttpSolrServer(url); - // poll for a second - it can take a moment before we are ready to serve - waitForNon403or404or503(collectionClient); - } } }