From 35bfe897901f1b51bce654b49aecd9560bfa797f Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Fri, 30 Mar 2018 20:11:39 +0700 Subject: [PATCH] SOLR-12066: Cleanup deleted core when node start --- solr/CHANGES.txt | 2 ++ .../org/apache/solr/cloud/ZkController.java | 22 ++++++++++--- .../org/apache/solr/core/CoreContainer.java | 7 +++- .../apache/solr/cloud/DeleteReplicaTest.java | 33 +++++++++++++++++++ 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 5854e0ff7fb..12bc25a6165 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -110,6 +110,8 @@ Optimizations * SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat) +* SOLR-12066: Cleanup deleted core when node start (Cao Manh Dat) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java index c0ddd260adf..872a8b9d7e1 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java +++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java @@ -1661,6 +1661,9 @@ public class ZkController { Thread.currentThread().interrupt(); log.error("", e); throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e); + } catch (NotInClusterStateException e) { + // make the stack trace less verbose + throw e; } catch (Exception e) { log.error("", e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e); @@ -1688,7 +1691,7 @@ public class ZkController { return true; } - private void checkStateInZk(CoreDescriptor cd) throws InterruptedException { + private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException { if (!Overseer.isLegacy(zkStateReader)) { CloudDescriptor cloudDesc = cd.getCloudDescriptor(); String nodeName = cloudDesc.getCoreNodeName(); @@ -1722,7 +1725,8 @@ public class ZkController { } Replica replica = slice.getReplica(coreNodeName); if (replica == null) { - 
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId()); + errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() + + ", ignore the exception if the replica was deleted"); return false; } return true; @@ -1730,8 +1734,9 @@ public class ZkController { } catch (TimeoutException e) { String error = errorMessage.get(); if (error == null) - error = "Replica " + coreNodeName + " is not present in cluster state"; - throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get()); + error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() + + ", ignore the exception if the replica was deleted"; + throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error); } } } @@ -2711,6 +2716,15 @@ public class ZkController { } } + /** + * Thrown during the pre-register process if the replica is not present in the cluster state + */ + public static class NotInClusterStateException extends SolrException { + public NotInClusterStateException(ErrorCode code, String msg) { + super(code, msg); + } + } + public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) { DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName()); if (collection != null) { diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index b667bc06209..74b718cdca5 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -677,7 +677,7 @@ public class CoreContainer { } catch (InterruptedException e) { Thread.currentThread().interrupt(); } catch (ExecutionException e) { - log.error("Error waiting for SolrCore to be created", e); + log.error("Error waiting for SolrCore to be loaded on startup", e.getCause()); } } } finally { @@ -1063,6 +1063,11 @@ public class 
CoreContainer { return core; } catch (Exception e) { coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e)); + if (e instanceof ZkController.NotInClusterStateException && !newCollection) { + // this mostly happens when the core was deleted while this node was down + unload(dcore.getName(), true, true, true); + throw e; + } solrCores.removeCoreDescriptor(dcore); final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e); if(core != null && !core.isClosed()) diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java index 3208ebd5dc6..1a021d7d8f3 100644 --- a/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/DeleteReplicaTest.java @@ -41,7 +41,10 @@ import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.util.TimeSource; import org.apache.solr.common.util.Utils; +import org.apache.solr.core.CoreDescriptor; +import org.apache.solr.core.SolrCore; import org.apache.solr.core.ZkContainer; +import org.apache.solr.util.FileUtils; import org.apache.solr.util.TimeOut; import org.apache.zookeeper.KeeperException; import org.junit.BeforeClass; @@ -152,6 +155,36 @@ public class DeleteReplicaTest extends SolrCloudTestCase { } + @Test + public void deleteReplicaOnDownNode() throws Exception { + final String collectionName = "deleteReplicaOnDownNode"; + CollectionAdminRequest.createCollection(collectionName, "conf", 1, 2).process(cluster.getSolrClient()); + waitForState("Expected one shards with two replicas", collectionName, clusterShape(1, 2)); + + Slice shard = getCollectionState(collectionName).getSlice("shard1"); + Replica replica = shard.getReplicas(rep -> !rep.getName().equals(shard.getLeader().getName())).get(0); + JettySolrRunner replicaJetty = getJettyForReplica(replica); + 
CoreDescriptor replicaCd; + try (SolrCore core = replicaJetty.getCoreContainer().getCore(replica.getCoreName())) { + replicaCd = core.getCoreDescriptor(); + } + assertNotNull("Expected core descriptor of "+ replica.getName() + " is not null",replicaCd); + String replicaJettyNodeName = replicaJetty.getNodeName(); + + // shut down the node hosting the replica + replicaJetty.stop(); + waitForNodeLeave(replicaJettyNodeName); + waitForState("Expected one shards with one replica", collectionName, clusterShape(1, 1)); + CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName()).process(cluster.getSolrClient()); + waitForState("Expected only one replica left", collectionName, (liveNodes, collectionState) -> collectionState.getReplicas().size() == 1); + + // restart the node and make sure the data gets deleted + replicaJetty.start(); + TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME); + timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", () + -> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir())); + } + @Test public void deleteReplicaByCountForAllShards() throws Exception {