SOLR-11124: MoveReplicaCmd should skip deleting the old replica when its node is not live

This commit is contained in:
Cao Manh Dat 2017-07-20 14:39:30 +07:00
parent 227eeefcd0
commit bd283c895f
2 changed files with 37 additions and 29 deletions

View File

@ -79,6 +79,8 @@ Optimizations
* SOLR-10985: Remove unnecessary toString() calls in solr-core's search package's debug logging.
(Michael Braun via Christine Poerschke)
* SOLR-11124: MoveReplicaCmd should skip deleting the old replica when its node is not live (Cao Manh Dat)
Other Changes
----------------------

View File

@ -119,36 +119,41 @@ public class MoveReplicaCmd implements Cmd{
private void moveHdfsReplica(ClusterState clusterState, NamedList results, String dataDir, String targetNode, String async,
DocCollection coll, Replica replica, Slice slice, int timeout) throws Exception {
ZkNodeProps removeReplicasProps = new ZkNodeProps(
COLLECTION_PROP, coll.getName(),
SHARD_ID_PROP, slice.getName(),
REPLICA_PROP, replica.getName()
);
removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_DATA_DIR, false);
removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_INDEX, false);
if(async!=null) removeReplicasProps.getProperties().put(ASYNC, async);
NamedList deleteResult = new NamedList();
ocmh.deleteReplica(clusterState, removeReplicasProps, deleteResult, null);
if (deleteResult.get("failure") != null) {
String errorString = String.format(Locale.ROOT, "Failed to cleanup replica collection=%s shard=%s name=%s",
coll.getName(), slice.getName(), replica.getName());
log.warn(errorString);
results.add("failure", errorString + ", because of : " + deleteResult.get("failure"));
return;
}
TimeOut timeOut = new TimeOut(20L, TimeUnit.SECONDS);
while (!timeOut.hasTimedOut()) {
coll = ocmh.zkStateReader.getClusterState().getCollection(coll.getName());
if (coll.getReplica(replica.getName()) != null) {
Thread.sleep(100);
} else {
break;
String skipCreateReplicaInClusterState = "true";
if (clusterState.getLiveNodes().contains(replica.getNodeName())) {
skipCreateReplicaInClusterState = "false";
ZkNodeProps removeReplicasProps = new ZkNodeProps(
COLLECTION_PROP, coll.getName(),
SHARD_ID_PROP, slice.getName(),
REPLICA_PROP, replica.getName()
);
removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_DATA_DIR, false);
removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_INDEX, false);
if(async!=null) removeReplicasProps.getProperties().put(ASYNC, async);
NamedList deleteResult = new NamedList();
ocmh.deleteReplica(clusterState, removeReplicasProps, deleteResult, null);
if (deleteResult.get("failure") != null) {
String errorString = String.format(Locale.ROOT, "Failed to cleanup replica collection=%s shard=%s name=%s",
coll.getName(), slice.getName(), replica.getName());
log.warn(errorString);
results.add("failure", errorString + ", because of : " + deleteResult.get("failure"));
return;
}
}
if (timeOut.hasTimedOut()) {
results.add("failure", "Still see deleted replica in clusterstate!");
return;
TimeOut timeOut = new TimeOut(20L, TimeUnit.SECONDS);
while (!timeOut.hasTimedOut()) {
coll = ocmh.zkStateReader.getClusterState().getCollection(coll.getName());
if (coll.getReplica(replica.getName()) != null) {
Thread.sleep(100);
} else {
break;
}
}
if (timeOut.hasTimedOut()) {
results.add("failure", "Still see deleted replica in clusterstate!");
return;
}
}
String ulogDir = replica.getStr(CoreAdminParams.ULOG_DIR);
@ -158,6 +163,7 @@ public class MoveReplicaCmd implements Cmd{
CoreAdminParams.NODE, targetNode,
CoreAdminParams.CORE_NODE_NAME, replica.getName(),
CoreAdminParams.NAME, replica.getCoreName(),
SKIP_CREATE_REPLICA_IN_CLUSTER_STATE, skipCreateReplicaInClusterState,
CoreAdminParams.ULOG_DIR, ulogDir.substring(0, ulogDir.lastIndexOf(UpdateLog.TLOG_NAME)),
CoreAdminParams.DATA_DIR, dataDir);
if(async!=null) addReplicasProps.getProperties().put(ASYNC, async);