SOLR-12066: Cleanup deleted core when node starts

This commit is contained in:
Cao Manh Dat 2018-03-30 20:11:39 +07:00
parent d483108a15
commit 35bfe89790
4 changed files with 59 additions and 5 deletions

View File

@ -110,6 +110,8 @@ Optimizations
* SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)
* SOLR-12066: Cleanup deleted core when node starts (Cao Manh Dat)
Other Changes
----------------------

View File

@ -1661,6 +1661,9 @@ public class ZkController {
Thread.currentThread().interrupt();
log.error("", e);
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
} catch (NotInClusterStateException e) {
// make the stack trace less verbose
throw e;
} catch (Exception e) {
log.error("", e);
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);
@ -1688,7 +1691,7 @@ public class ZkController {
return true;
}
private void checkStateInZk(CoreDescriptor cd) throws InterruptedException {
private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException {
if (!Overseer.isLegacy(zkStateReader)) {
CloudDescriptor cloudDesc = cd.getCloudDescriptor();
String nodeName = cloudDesc.getCoreNodeName();
@ -1722,7 +1725,8 @@ public class ZkController {
}
Replica replica = slice.getReplica(coreNodeName);
if (replica == null) {
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
", ignore the exception if the replica was deleted");
return false;
}
return true;
@ -1730,8 +1734,9 @@ public class ZkController {
} catch (TimeoutException e) {
String error = errorMessage.get();
if (error == null)
error = "Replica " + coreNodeName + " is not present in cluster state";
throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
", ignore the exception if the replica was deleted";
throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
}
}
}
@ -2711,6 +2716,15 @@ public class ZkController {
}
}
/**
 * Signals that a replica's entry could not be found in the cluster state
 * during the pre-register step; this usually means the replica was deleted
 * (e.g. while this node was down), so callers may treat it as non-fatal.
 */
public static class NotInClusterStateException extends SolrException {
public NotInClusterStateException(ErrorCode errorCode, String message) {
super(errorCode, message);
}
}
public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
if (collection != null) {

View File

@ -677,7 +677,7 @@ public class CoreContainer {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} catch (ExecutionException e) {
log.error("Error waiting for SolrCore to be created", e);
log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
}
}
} finally {
@ -1063,6 +1063,11 @@ public class CoreContainer {
return core;
} catch (Exception e) {
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
if (e instanceof ZkController.NotInClusterStateException && !newCollection) {
// this mostly happens when the core was deleted while this node was down
unload(dcore.getName(), true, true, true);
throw e;
}
solrCores.removeCoreDescriptor(dcore);
final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
if(core != null && !core.isClosed())

View File

@ -41,7 +41,10 @@ import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.ZkContainer;
import org.apache.solr.util.FileUtils;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.KeeperException;
import org.junit.BeforeClass;
@ -152,6 +155,36 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
}
@Test
public void deleteReplicaOnDownNode() throws Exception {
// Verifies SOLR-12066: when a replica is deleted while its node is down,
// the restarted node detects the core is gone from cluster state and
// removes its on-disk instance/data directories.
final String collectionName = "deleteReplicaOnDownNode";
CollectionAdminRequest.createCollection(collectionName, "conf", 1, 2).process(cluster.getSolrClient());
waitForState("Expected one shards with two replicas", collectionName, clusterShape(1, 2));
// pick a non-leader replica so the shard keeps a live leader after shutdown
Slice shard = getCollectionState(collectionName).getSlice("shard1");
Replica replica = shard.getReplicas(rep -> !rep.getName().equals(shard.getLeader().getName())).get(0);
JettySolrRunner replicaJetty = getJettyForReplica(replica);
// capture the core descriptor before stopping the node; its instance/data
// dirs are checked for deletion after the restart below
CoreDescriptor replicaCd;
try (SolrCore core = replicaJetty.getCoreContainer().getCore(replica.getCoreName())) {
replicaCd = core.getCoreDescriptor();
}
assertNotNull("Expected core descriptor of "+ replica.getName() + " is not null",replicaCd);
String replicaJettyNodeName = replicaJetty.getNodeName();
// shutdown node of a replica
replicaJetty.stop();
waitForNodeLeave(replicaJettyNodeName);
waitForState("Expected one shards with one replica", collectionName, clusterShape(1, 1));
// delete the replica while its node is down, then wait for the cluster
// state to reflect the deletion before restarting the node
CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName()).process(cluster.getSolrClient());
waitForState("Expected only one replica left", collectionName, (liveNodes, collectionState) -> collectionState.getReplicas().size() == 1);
// restart the node and make sure the replica's data gets deleted
replicaJetty.start();
TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME);
timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", ()
-> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir()));
}
@Test
public void deleteReplicaByCountForAllShards() throws Exception {