mirror of https://github.com/apache/lucene.git

SOLR-12066: Cleanup deleted core when node start

parent d483108a15
commit 35bfe89790
@@ -110,6 +110,8 @@ Optimizations

* SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)

* SOLR-12066: Cleanup deleted core when node start (Cao Manh Dat)

Other Changes
----------------------

@@ -1661,6 +1661,9 @@ public class ZkController {
      Thread.currentThread().interrupt();
      log.error("", e);
      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    } catch (NotInClusterStateException e) {
      // make the stack trace less verbose
      throw e;
    } catch (Exception e) {
      log.error("", e);
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);

@@ -1688,7 +1691,7 @@ public class ZkController {
    return true;
  }

  private void checkStateInZk(CoreDescriptor cd) throws InterruptedException {
  private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException {
    if (!Overseer.isLegacy(zkStateReader)) {
      CloudDescriptor cloudDesc = cd.getCloudDescriptor();
      String nodeName = cloudDesc.getCoreNodeName();

@@ -1722,7 +1725,8 @@ public class ZkController {
        }
        Replica replica = slice.getReplica(coreNodeName);
        if (replica == null) {
          errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
          errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
              ", ignore the exception if the replica was deleted");
          return false;
        }
        return true;

@@ -1730,8 +1734,9 @@ public class ZkController {
      } catch (TimeoutException e) {
        String error = errorMessage.get();
        if (error == null)
          error = "Replica " + coreNodeName + " is not present in cluster state";
        throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
          error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
              ", ignore the exception if the replica was deleted";
        throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
      }
    }
  }

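For orientation: the errorMessage.set(...) calls above run inside a CollectionStatePredicate passed to ZkStateReader.waitForState, and the TimeoutException handled in the @@ -1730 hunk is the one waitForState throws when the predicate never matches. The sketch below is only a reading aid under that assumption, not the verbatim method body; the helper name, parameter list and the 10 second timeout are illustrative.

// Structural sketch only (not the verbatim Solr code): shows where the errorMessage.set(...)
// lines above live. Helper name, parameters and the 10 second timeout are assumptions.
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;

class CheckStateInZkSketch {
  static void waitUntilReplicaVisible(ZkStateReader zkStateReader, String collection,
                                      String shardId, String coreNodeName)
      throws InterruptedException, TimeoutException {
    AtomicReference<String> errorMessage = new AtomicReference<>();
    zkStateReader.waitForState(collection, 10, TimeUnit.SECONDS, (liveNodes, c) -> {
      if (c == null) return false;                       // collection not visible yet
      Slice slice = c.getSlice(shardId);
      if (slice == null) return false;                   // shard not visible yet
      Replica replica = slice.getReplica(coreNodeName);
      if (replica == null) {
        errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + shardId +
            ", ignore the exception if the replica was deleted");
        return false;                                    // keep waiting until the timeout fires
      }
      return true;                                       // replica found, registration proceeds
    });
    // On timeout, waitForState throws TimeoutException; the @@ -1730,8 +1734,9 @@ hunk above
    // converts it into NotInClusterStateException, using errorMessage.get() when it was set.
  }
}
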
@@ -2711,6 +2716,15 @@ public class ZkController {
    }
  }

  /**
   * Thrown during pre register process if the replica is not present in clusterstate
   */
  public static class NotInClusterStateException extends SolrException {
    public NotInClusterStateException(ErrorCode code, String msg) {
      super(code, msg);
    }
  }

  public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
    DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
    if (collection != null) {

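Note that NotInClusterStateException extends SolrException, so existing handlers that catch SolrException keep working. ZkController's own catch block in the @@ -1661 hunk merely rethrows it to keep the stack trace short, while CoreContainer (next hunks) checks the concrete type to decide that the core was deleted while the node was down and should be cleaned up rather than only recorded as a load failure.
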
@@ -677,7 +677,7 @@ public class CoreContainer {
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        } catch (ExecutionException e) {
          log.error("Error waiting for SolrCore to be created", e);
          log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
        }
      }
    } finally {

@@ -1063,6 +1063,11 @@ public class CoreContainer {
      return core;
    } catch (Exception e) {
      coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
      if (e instanceof ZkController.NotInClusterStateException && !newCollection) {
        // this mostly happen when the core is deleted when this node is down
        unload(dcore.getName(), true, true, true);
        throw e;
      }
      solrCores.removeCoreDescriptor(dcore);
      final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
      if(core != null && !core.isClosed())

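The three boolean arguments of the unload(...) call above map to CoreContainer.unload(String name, boolean deleteIndexDir, boolean deleteDataDir, boolean deleteInstanceDir), so the deleted replica's core is removed from disk as well, not just closed. A minimal sketch spelling out the flags; the helper class and variable names are illustrative, not part of the patch.

import org.apache.solr.core.CoreContainer;

// Sketch only: names the boolean flags used by the unload(...) call in the hunk above.
class DeletedCoreCleanupSketch {
  static void dropStaleCore(CoreContainer container, String coreName) {
    container.unload(coreName,
        true,   // deleteIndexDir: also delete the Lucene index directory
        true,   // deleteDataDir: also delete the core's data directory
        true);  // deleteInstanceDir: also delete the instance directory (core.properties etc.)
  }
}
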
@@ -41,7 +41,10 @@ import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.ZkContainer;
import org.apache.solr.util.FileUtils;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.KeeperException;
import org.junit.BeforeClass;

@@ -152,6 +155,36 @@ public class DeleteReplicaTest extends SolrCloudTestCase {

  }

  @Test
  public void deleteReplicaOnDownNode() throws Exception {
    final String collectionName = "deleteReplicaOnDownNode";
    CollectionAdminRequest.createCollection(collectionName, "conf", 1, 2).process(cluster.getSolrClient());
    waitForState("Expected one shards with two replicas", collectionName, clusterShape(1, 2));

    Slice shard = getCollectionState(collectionName).getSlice("shard1");
    Replica replica = shard.getReplicas(rep -> !rep.getName().equals(shard.getLeader().getName())).get(0);
    JettySolrRunner replicaJetty = getJettyForReplica(replica);
    CoreDescriptor replicaCd;
    try (SolrCore core = replicaJetty.getCoreContainer().getCore(replica.getCoreName())) {
      replicaCd = core.getCoreDescriptor();
    }
    assertNotNull("Expected core descriptor of "+ replica.getName() + " is not null",replicaCd);
    String replicaJettyNodeName = replicaJetty.getNodeName();

    // shutdown node of a replica
    replicaJetty.stop();
    waitForNodeLeave(replicaJettyNodeName);
    waitForState("Expected one shards with one replica", collectionName, clusterShape(1, 1));
    CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName()).process(cluster.getSolrClient());
    waitForState("Expected only one replica left", collectionName, (liveNodes, collectionState) -> collectionState.getReplicas().size() == 1);

    // restart the test and make sure the data get deleted
    replicaJetty.start();
    TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME);
    timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", ()
        -> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir()));
  }

  @Test
  public void deleteReplicaByCountForAllShards() throws Exception {

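End to end, the new deleteReplicaOnDownNode test creates a 1x2 collection, stops the node hosting the non-leader replica, deletes that replica through the Collections API while its node is down, and then restarts the node; the final assertion gives the restarted node up to 60 seconds to notice the replica is gone from cluster state and to delete its data and instance directories.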