From bd283c895f6e81f97608c0f12fec22f6fcf8e89d Mon Sep 17 00:00:00 2001 From: Cao Manh Dat Date: Thu, 20 Jul 2017 14:39:30 +0700 Subject: [PATCH] SOLR-11124: MoveReplicaCmd should skip deleting the old replica if its node is not live --- solr/CHANGES.txt | 2 + .../org/apache/solr/cloud/MoveReplicaCmd.java | 64 ++++++++++--------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index dfbc01074c5..90ab2924721 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -79,6 +79,8 @@ Optimizations * SOLR-10985: Remove unnecessary toString() calls in solr-core's search package's debug logging. (Michael Braun via Christine Poerschke) +* SOLR-11124: MoveReplicaCmd should skip deleting the old replica if its node is not live (Cao Manh Dat) + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java index 8c4e9d76ee5..3f29fc5f69c 100644 --- a/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/MoveReplicaCmd.java @@ -119,36 +119,41 @@ public class MoveReplicaCmd implements Cmd{ private void moveHdfsReplica(ClusterState clusterState, NamedList results, String dataDir, String targetNode, String async, DocCollection coll, Replica replica, Slice slice, int timeout) throws Exception { - ZkNodeProps removeReplicasProps = new ZkNodeProps( - COLLECTION_PROP, coll.getName(), - SHARD_ID_PROP, slice.getName(), - REPLICA_PROP, replica.getName() - ); - removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_DATA_DIR, false); - removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_INDEX, false); - if(async!=null) removeReplicasProps.getProperties().put(ASYNC, async); - NamedList deleteResult = new NamedList(); - ocmh.deleteReplica(clusterState, removeReplicasProps, deleteResult, null); - if (deleteResult.get("failure") != null) { - String 
errorString = String.format(Locale.ROOT, "Failed to cleanup replica collection=%s shard=%s name=%s", - coll.getName(), slice.getName(), replica.getName()); - log.warn(errorString); - results.add("failure", errorString + ", because of : " + deleteResult.get("failure")); - return; - } - - TimeOut timeOut = new TimeOut(20L, TimeUnit.SECONDS); - while (!timeOut.hasTimedOut()) { - coll = ocmh.zkStateReader.getClusterState().getCollection(coll.getName()); - if (coll.getReplica(replica.getName()) != null) { - Thread.sleep(100); - } else { - break; + String skipCreateReplicaInClusterState = "true"; + if (clusterState.getLiveNodes().contains(replica.getNodeName())) { + skipCreateReplicaInClusterState = "false"; + ZkNodeProps removeReplicasProps = new ZkNodeProps( + COLLECTION_PROP, coll.getName(), + SHARD_ID_PROP, slice.getName(), + REPLICA_PROP, replica.getName() + ); + removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_DATA_DIR, false); + removeReplicasProps.getProperties().put(CoreAdminParams.DELETE_INDEX, false); + if(async!=null) removeReplicasProps.getProperties().put(ASYNC, async); + NamedList deleteResult = new NamedList(); + ocmh.deleteReplica(clusterState, removeReplicasProps, deleteResult, null); + if (deleteResult.get("failure") != null) { + String errorString = String.format(Locale.ROOT, "Failed to cleanup replica collection=%s shard=%s name=%s", + coll.getName(), slice.getName(), replica.getName()); + log.warn(errorString); + results.add("failure", errorString + ", because of : " + deleteResult.get("failure")); + return; } - } - if (timeOut.hasTimedOut()) { - results.add("failure", "Still see deleted replica in clusterstate!"); - return; + + TimeOut timeOut = new TimeOut(20L, TimeUnit.SECONDS); + while (!timeOut.hasTimedOut()) { + coll = ocmh.zkStateReader.getClusterState().getCollection(coll.getName()); + if (coll.getReplica(replica.getName()) != null) { + Thread.sleep(100); + } else { + break; + } + } + if (timeOut.hasTimedOut()) { + 
results.add("failure", "Still see deleted replica in clusterstate!"); + return; + } + } String ulogDir = replica.getStr(CoreAdminParams.ULOG_DIR); @@ -158,6 +163,7 @@ public class MoveReplicaCmd implements Cmd{ CoreAdminParams.NODE, targetNode, CoreAdminParams.CORE_NODE_NAME, replica.getName(), CoreAdminParams.NAME, replica.getCoreName(), + SKIP_CREATE_REPLICA_IN_CLUSTER_STATE, skipCreateReplicaInClusterState, CoreAdminParams.ULOG_DIR, ulogDir.substring(0, ulogDir.lastIndexOf(UpdateLog.TLOG_NAME)), CoreAdminParams.DATA_DIR, dataDir); if(async!=null) addReplicasProps.getProperties().put(ASYNC, async);