From 247142e18ceaff5a87cd4fd9c6ad509cf86edc07 Mon Sep 17 00:00:00 2001
From: Shalin Shekhar Mangar
Date: Thu, 12 Sep 2013 07:14:10 +0000
Subject: [PATCH] SOLR-5233: The deleteshard collections API doesn't wait for
 cluster state to update, can fail if some nodes of the deleted shard were
 down and had incorrect logging.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522463 13f79535-47bb-0310-9956-ffa450edef68
---
 solr/CHANGES.txt                                       |  4 ++++
 .../core/src/java/org/apache/solr/cloud/Overseer.java  |  3 ++-
 .../solr/cloud/OverseerCollectionProcessor.java        | 11 ++++++-----
 .../apache/solr/handler/admin/CollectionsHandler.java  |  6 +++---
 .../test/org/apache/solr/cloud/DeleteShardTest.java    |  8 ++++----
 5 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 49aeaff0b20..6bcee8751da 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -233,6 +233,10 @@ Bug Fixes
   documents in the same index segment had a value of true. (Robert Muir, hossman, yonik)
 
+* SOLR-5233: The "deleteshard" collections API doesn't wait for cluster state to update,
+  can fail if some nodes of the deleted shard were down and had incorrect logging.
+  (Christine Poerschke, shalin)
+
 Optimizations
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index 80b07c899f7..9b357890155 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -598,10 +598,11 @@ public class Overseer {
       * Remove collection slice from cloudstate
       */
     private ClusterState removeShard(final ClusterState clusterState, ZkNodeProps message) {
-
       final String collection = message.getStr(ZkStateReader.COLLECTION_PROP);
       final String sliceId = message.getStr(ZkStateReader.SHARD_ID_PROP);
+      log.info("Removing collection: " + collection + " shard: " + sliceId + " from clusterstate");
+
       final Map<String, DocCollection> newCollections = new LinkedHashMap<String, DocCollection>(clusterState.getCollectionStates());  // shallow copy
       DocCollection coll = newCollections.get(collection);
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java
index 902bf027a4c..89f244ca836 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionProcessor.java
@@ -820,7 +820,8 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
       } while (srsp != null);
 
       ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION,
-          Overseer.REMOVESHARD, ZkStateReader.COLLECTION_PROP, collection);
+          Overseer.REMOVESHARD, ZkStateReader.COLLECTION_PROP, collection,
+          ZkStateReader.SHARD_ID_PROP, sliceId);
       Overseer.getInQueue(zkStateReader.getZkClient()).offer(ZkStateReader.toJSON(m));
 
       // wait for a while until we don't see the shard
@@ -829,7 +830,7 @@ public class OverseerCollectionProcessor implements Runnable, ClosableThread {
       boolean removed = false;
       while (System.currentTimeMillis() < timeout) {
         Thread.sleep(100);
-        removed = zkStateReader.getClusterState().getSlice(collection, message.getStr("name")) == null;
+        removed = zkStateReader.getClusterState().getSlice(collection, sliceId) == null;
         if (removed) {
           Thread.sleep(100); // just a bit of time so it's more likely other readers see on return
           break;
         }
       }
       if (!removed) {
         throw new SolrException(ErrorCode.SERVER_ERROR,
-            "Could not fully remove collection: " + collection + " shard: " + message.getStr("name"));
+            "Could not fully remove collection: " + collection + " shard: " + sliceId);
       }
-      log.info("Successfully deleted collection " + collection + ", shard: " + message.getStr("name"));
+      log.info("Successfully deleted collection: " + collection + ", shard: " + sliceId);
 
     } catch (SolrException e) {
       throw e;
     } catch (Exception e) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error executing delete operation for collection: " + collection + " shard: " + message.getStr("name"), e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Error executing delete operation for collection: " + collection + " shard: " + sliceId, e);
     }
   }
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
index f859289deb1..ddcd87afb37 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java
@@ -329,11 +329,11 @@ public class CollectionsHandler extends RequestHandlerBase {
   private void handleDeleteShardAction(SolrQueryRequest req,
       SolrQueryResponse rsp) throws InterruptedException, KeeperException {
     log.info("Deleting Shard : " + req.getParamString());
-    String name = req.getParams().required().get("collection");
-    String shard = req.getParams().required().get("shard");
+    String name = req.getParams().required().get(ZkStateReader.COLLECTION_PROP);
+    String shard = req.getParams().required().get(ZkStateReader.SHARD_ID_PROP);
 
     Map<String, Object> props = new HashMap<String, Object>();
-    props.put("collection", name);
+    props.put(ZkStateReader.COLLECTION_PROP, name);
     props.put(Overseer.QUEUE_OPERATION, OverseerCollectionProcessor.DELETESHARD);
     props.put(ZkStateReader.SHARD_ID_PROP, shard);
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
index 2455616f82b..9d7baffe52d 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
@@ -101,18 +101,18 @@ public class DeleteShardTest extends AbstractFullDistribZkTestBase {
 
     deleteShard(SHARD1);
 
-    confirmShardDeletion();
+    confirmShardDeletion(SHARD1);
   }
 
-  protected void confirmShardDeletion() throws SolrServerException, KeeperException,
+  protected void confirmShardDeletion(String shard) throws SolrServerException, KeeperException,
       InterruptedException {
     ZkStateReader zkStateReader = cloudClient.getZkStateReader();
-    ClusterState clusterState = null;
+    ClusterState clusterState = zkStateReader.getClusterState();
     int counter = 10;
     while (counter-- > 0) {
       zkStateReader.updateClusterState(true);
       clusterState = zkStateReader.getClusterState();
-      if (clusterState.getSlice("collection1", SHARD1) == null) {
+      if (clusterState.getSlice("collection1", shard) == null) {
        break;
      }
      Thread.sleep(1000);
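
Note on the wait pattern in this patch: the fix makes the DELETESHARD path block until the slice actually disappears from the cluster state, and DeleteShardTest verifies the same thing by polling ZkStateReader. Below is a minimal standalone sketch of that polling loop for reference; the class and method names (ShardDeletionWatcher, waitForShardDeletion) are illustrative and not part of this commit, while the ZkStateReader/ClusterState calls are the ones the test itself uses.

// Illustrative sketch (not part of the commit): poll the cluster state until the
// given shard is gone, mirroring the loop in DeleteShardTest.confirmShardDeletion().
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ZkStateReader;

public class ShardDeletionWatcher {

  /** Returns true if the shard vanished from the cluster state within maxRetries seconds. */
  public static boolean waitForShardDeletion(ZkStateReader zkStateReader, String collection,
      String shard, int maxRetries) throws Exception {
    for (int counter = maxRetries; counter > 0; counter--) {
      zkStateReader.updateClusterState(true);   // force a refresh from ZooKeeper (SolrCloud 4.x API)
      ClusterState clusterState = zkStateReader.getClusterState();
      if (clusterState.getSlice(collection, shard) == null) {
        return true;                            // slice no longer present in cluster state
      }
      Thread.sleep(1000);                       // same one-second back-off the test uses
    }
    return false;
  }
}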