mirror of https://github.com/apache/lucene.git
SOLR-12066: Cleanup deleted core when node start
This commit is contained in:
parent
d483108a15
commit
35bfe89790
|
@ -110,6 +110,8 @@ Optimizations
|
||||||
|
|
||||||
* SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)
|
* SOLR-12146: LIR should skip deleted replicas (Cao Manh Dat)
|
||||||
|
|
||||||
|
* SOLR-12066: Clean up deleted cores when a node starts (Cao Manh Dat)
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -1661,6 +1661,9 @@ public class ZkController {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
log.error("", e);
|
log.error("", e);
|
||||||
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
|
throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
|
||||||
|
} catch (NotInClusterStateException e) {
|
||||||
|
// make the stack trace less verbose
|
||||||
|
throw e;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("", e);
|
log.error("", e);
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", e);
|
||||||
|
@ -1688,7 +1691,7 @@ public class ZkController {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkStateInZk(CoreDescriptor cd) throws InterruptedException {
|
private void checkStateInZk(CoreDescriptor cd) throws InterruptedException, NotInClusterStateException {
|
||||||
if (!Overseer.isLegacy(zkStateReader)) {
|
if (!Overseer.isLegacy(zkStateReader)) {
|
||||||
CloudDescriptor cloudDesc = cd.getCloudDescriptor();
|
CloudDescriptor cloudDesc = cd.getCloudDescriptor();
|
||||||
String nodeName = cloudDesc.getCoreNodeName();
|
String nodeName = cloudDesc.getCoreNodeName();
|
||||||
|
@ -1722,7 +1725,8 @@ public class ZkController {
|
||||||
}
|
}
|
||||||
Replica replica = slice.getReplica(coreNodeName);
|
Replica replica = slice.getReplica(coreNodeName);
|
||||||
if (replica == null) {
|
if (replica == null) {
|
||||||
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId());
|
errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
|
||||||
|
", ignore the exception if the replica was deleted");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
@ -1730,8 +1734,9 @@ public class ZkController {
|
||||||
} catch (TimeoutException e) {
|
} catch (TimeoutException e) {
|
||||||
String error = errorMessage.get();
|
String error = errorMessage.get();
|
||||||
if (error == null)
|
if (error == null)
|
||||||
error = "Replica " + coreNodeName + " is not present in cluster state";
|
error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
|
||||||
throw new SolrException(ErrorCode.SERVER_ERROR, error + ": " + collectionState.get());
|
", ignore the exception if the replica was deleted";
|
||||||
|
throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2711,6 +2716,15 @@ public class ZkController {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Thrown during the pre-register process if the replica is not present in the cluster state
|
||||||
|
*/
|
||||||
|
public static class NotInClusterStateException extends SolrException {
|
||||||
|
public NotInClusterStateException(ErrorCode code, String msg) {
|
||||||
|
super(code, msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
|
public boolean checkIfCoreNodeNameAlreadyExists(CoreDescriptor dcore) {
|
||||||
DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
|
DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(dcore.getCollectionName());
|
||||||
if (collection != null) {
|
if (collection != null) {
|
||||||
|
|
|
@ -677,7 +677,7 @@ public class CoreContainer {
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
} catch (ExecutionException e) {
|
} catch (ExecutionException e) {
|
||||||
log.error("Error waiting for SolrCore to be created", e);
|
log.error("Error waiting for SolrCore to be loaded on startup", e.getCause());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -1063,6 +1063,11 @@ public class CoreContainer {
|
||||||
return core;
|
return core;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
|
coreInitFailures.put(dcore.getName(), new CoreLoadFailure(dcore, e));
|
||||||
|
if (e instanceof ZkController.NotInClusterStateException && !newCollection) {
|
||||||
|
// this mostly happens when the core was deleted while this node was down
|
||||||
|
unload(dcore.getName(), true, true, true);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
solrCores.removeCoreDescriptor(dcore);
|
solrCores.removeCoreDescriptor(dcore);
|
||||||
final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
|
final SolrException solrException = new SolrException(ErrorCode.SERVER_ERROR, "Unable to create core [" + dcore.getName() + "]", e);
|
||||||
if(core != null && !core.isClosed())
|
if(core != null && !core.isClosed())
|
||||||
|
|
|
@ -41,7 +41,10 @@ import org.apache.solr.common.cloud.ZkNodeProps;
|
||||||
import org.apache.solr.common.cloud.ZkStateReader;
|
import org.apache.solr.common.cloud.ZkStateReader;
|
||||||
import org.apache.solr.common.util.TimeSource;
|
import org.apache.solr.common.util.TimeSource;
|
||||||
import org.apache.solr.common.util.Utils;
|
import org.apache.solr.common.util.Utils;
|
||||||
|
import org.apache.solr.core.CoreDescriptor;
|
||||||
|
import org.apache.solr.core.SolrCore;
|
||||||
import org.apache.solr.core.ZkContainer;
|
import org.apache.solr.core.ZkContainer;
|
||||||
|
import org.apache.solr.util.FileUtils;
|
||||||
import org.apache.solr.util.TimeOut;
|
import org.apache.solr.util.TimeOut;
|
||||||
import org.apache.zookeeper.KeeperException;
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
@ -152,6 +155,36 @@ public class DeleteReplicaTest extends SolrCloudTestCase {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void deleteReplicaOnDownNode() throws Exception {
|
||||||
|
final String collectionName = "deleteReplicaOnDownNode";
|
||||||
|
CollectionAdminRequest.createCollection(collectionName, "conf", 1, 2).process(cluster.getSolrClient());
|
||||||
|
waitForState("Expected one shards with two replicas", collectionName, clusterShape(1, 2));
|
||||||
|
|
||||||
|
Slice shard = getCollectionState(collectionName).getSlice("shard1");
|
||||||
|
Replica replica = shard.getReplicas(rep -> !rep.getName().equals(shard.getLeader().getName())).get(0);
|
||||||
|
JettySolrRunner replicaJetty = getJettyForReplica(replica);
|
||||||
|
CoreDescriptor replicaCd;
|
||||||
|
try (SolrCore core = replicaJetty.getCoreContainer().getCore(replica.getCoreName())) {
|
||||||
|
replicaCd = core.getCoreDescriptor();
|
||||||
|
}
|
||||||
|
assertNotNull("Expected core descriptor of "+ replica.getName() + " is not null",replicaCd);
|
||||||
|
String replicaJettyNodeName = replicaJetty.getNodeName();
|
||||||
|
|
||||||
|
// shut down the node hosting the replica
|
||||||
|
replicaJetty.stop();
|
||||||
|
waitForNodeLeave(replicaJettyNodeName);
|
||||||
|
waitForState("Expected one shards with one replica", collectionName, clusterShape(1, 1));
|
||||||
|
CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName()).process(cluster.getSolrClient());
|
||||||
|
waitForState("Expected only one replica left", collectionName, (liveNodes, collectionState) -> collectionState.getReplicas().size() == 1);
|
||||||
|
|
||||||
|
// restart the node and make sure the data gets deleted
|
||||||
|
replicaJetty.start();
|
||||||
|
TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS, TimeSource.NANO_TIME);
|
||||||
|
timeOut.waitFor("Expected data dir and instance dir of " + replica.getName() + " is deleted", ()
|
||||||
|
-> !Files.exists(replicaCd.getInstanceDir()) && !FileUtils.fileExists(replicaCd.getDataDir()));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void deleteReplicaByCountForAllShards() throws Exception {
|
public void deleteReplicaByCountForAllShards() throws Exception {
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue