SOLR-9439: The delete shard API has been made more resilient against failures resulting from non-existent cores.

This commit is contained in:
Shalin Shekhar Mangar 2016-08-30 23:44:22 +05:30
parent 2700b95211
commit 02b97a29b7
4 changed files with 64 additions and 47 deletions

View File

@ -81,7 +81,8 @@ Bug Fixes
* SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
* SOLR-9439: Shard split clean up logic for older failed splits is faulty. (shalin) * SOLR-9439: Shard split clean up logic for older failed splits is faulty. The delete shard API
has also been made more resilient against failures resulting from non-existent cores. (shalin)
* SOLR-9430: Fix locale lookup in DIH <propertyWriter/> to use BCP47 language tags
to be consistent with other places in Solr. Language names still work for backwards

View File

@ -16,10 +16,14 @@
* limitations under the License. * limitations under the License.
*/ */
package org.apache.solr.cloud; package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodHandles;
import java.util.Collections; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd; import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd;
@ -27,18 +31,23 @@ import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.Utils; import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.util.TimeOut; import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD; import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC; import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
@ -87,24 +96,42 @@ public class DeleteShardCmd implements Cmd {
inQueue.offer(Utils.toJSON(m)); inQueue.offer(Utils.toJSON(m));
} }
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
String asyncId = message.getStr(ASYNC); String asyncId = message.getStr(ASYNC);
Map<String, String> requestMap = null;
if (asyncId != null) {
requestMap = new HashMap<>(slice.getReplicas().size(), 1.0f);
}
try { try {
ModifiableSolrParams params = new ModifiableSolrParams(); List<ZkNodeProps> replicas = getReplicasForSlice(collectionName, slice);
params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.UNLOAD.toString()); CountDownLatch cleanupLatch = new CountDownLatch(replicas.size());
params.set(CoreAdminParams.DELETE_INDEX, message.getBool(CoreAdminParams.DELETE_INDEX, true)); for (ZkNodeProps r : replicas) {
params.set(CoreAdminParams.DELETE_INSTANCE_DIR, message.getBool(CoreAdminParams.DELETE_INSTANCE_DIR, true)); final ZkNodeProps replica = r.plus(message.getProperties()).plus("parallel", "true").plus(ASYNC, asyncId);
params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true)); log.info("Deleting replica for collection={} shard={} on node={}", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(CoreAdminParams.NODE));
NamedList deleteResult = new NamedList();
ocmh.sliceCmd(clusterState, params, null, slice, shardHandler, asyncId, requestMap); try {
((DeleteReplicaCmd)ocmh.commandMap.get(DELETEREPLICA)).deleteReplica(clusterState, replica, deleteResult, () -> {
ocmh.processResponses(results, shardHandler, true, "Failed to delete shard", asyncId, requestMap, Collections.emptySet()); cleanupLatch.countDown();
if (deleteResult.get("failure") != null) {
synchronized (results) {
results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
" on node=%s", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(NODE_NAME_PROP)));
}
}
SimpleOrderedMap success = (SimpleOrderedMap) deleteResult.get("success");
if (success != null) {
synchronized (results) {
results.add("success", success);
}
}
});
} catch (KeeperException e) {
log.warn("Error deleting replica: " + r, e);
cleanupLatch.countDown();
} catch (Exception e) {
log.warn("Error deleting replica: " + r, e);
cleanupLatch.countDown();
throw e;
}
}
log.debug("Waiting for delete shard action to complete");
cleanupLatch.await(5, TimeUnit.MINUTES);
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP, ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
collectionName, ZkStateReader.SHARD_ID_PROP, sliceId); collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
@ -114,7 +141,7 @@ public class DeleteShardCmd implements Cmd {
// wait for a while until we don't see the shard // wait for a while until we don't see the shard
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS); TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
boolean removed = false; boolean removed = false;
while (! timeout.hasTimedOut()) { while (!timeout.hasTimedOut()) {
Thread.sleep(100); Thread.sleep(100);
DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName); DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
removed = collection.getSlice(sliceId) == null; removed = collection.getSlice(sliceId) == null;
@ -129,7 +156,6 @@ public class DeleteShardCmd implements Cmd {
} }
log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId); log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
} catch (SolrException e) { } catch (SolrException e) {
throw e; throw e;
} catch (Exception e) { } catch (Exception e) {
@ -137,4 +163,18 @@ public class DeleteShardCmd implements Cmd {
"Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e); "Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
} }
} }
/**
 * Builds one {@link ZkNodeProps} descriptor per replica of {@code slice}, carrying the
 * collection name, shard id, core name, replica name and owning node — the properties
 * the delete-replica command needs to locate and remove each core.
 */
private List<ZkNodeProps> getReplicasForSlice(String collectionName, Slice slice) {
  final List<ZkNodeProps> replicaProps = new ArrayList<>(slice.getReplicas().size());
  for (Replica r : slice.getReplicas()) {
    replicaProps.add(new ZkNodeProps(
        COLLECTION_PROP, collectionName,
        SHARD_ID_PROP, slice.getName(),
        ZkStateReader.CORE_NAME_PROP, r.getCoreName(),
        ZkStateReader.REPLICA_PROP, r.getName(),
        CoreAdminParams.NODE, r.getNodeName()));
  }
  return replicaProps;
}
} }

View File

@ -219,8 +219,6 @@ public class SplitShardCmd implements Cmd {
ZkNodeProps m = new ZkNodeProps(propMap); ZkNodeProps m = new ZkNodeProps(propMap);
try { try {
ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList()); ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
} catch (SolrException e) {
throwIfNotNonExistentCoreException(subSlice, e);
} catch (Exception e) { } catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice, throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
e); e);
@ -233,7 +231,7 @@ public class SplitShardCmd implements Cmd {
if (oldShardsDeleted) { if (oldShardsDeleted) {
// refresh the locally cached cluster state // refresh the locally cached cluster state
zkStateReader.forceUpdateCollection(collectionName); // we know we have the latest because otherwise deleteshard would have failed
clusterState = zkStateReader.getClusterState(); clusterState = zkStateReader.getClusterState();
collection = clusterState.getCollection(collectionName); collection = clusterState.getCollection(collectionName);
} }
@ -471,24 +469,4 @@ public class SplitShardCmd implements Cmd {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e);
} }
} }
/**
 * Rethrows {@code e} wrapped as a SERVER_ERROR unless a {@link SolrException} in its cause
 * chain carries metadata {@code cause=NonExistentCore}, in which case the delete-shard
 * failure is tolerated (the core to delete was already gone).
 *
 * NOTE(review): {@code cause} is overwritten on every SolrException encountered in the
 * chain, so the check after the loop only sees the metadata of the LAST SolrException
 * inspected — a "NonExistentCore" marker earlier in the chain can be masked by a later
 * SolrException whose metadata is null, causing a spurious rethrow. Verify against callers.
 */
private void throwIfNotNonExistentCoreException(String subSlice, SolrException e) {
Throwable t = e;
String cause = null;
// Walk the full cause chain, inspecting the "cause" metadata of each SolrException.
while (t != null) {
if (t instanceof SolrException) {
SolrException solrException = (SolrException) t;
cause = solrException.getMetadata("cause");
// A concrete cause other than NonExistentCore is a real failure: rethrow wrapped.
if (cause != null && !"NonExistentCore".equals(cause)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
e);
}
}
t = t.getCause();
}
// cause now holds the metadata of the last SolrException seen (possibly null).
if (!"NonExistentCore".equals(cause)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
e);
}
}
} }

View File

@ -1019,9 +1019,7 @@ public class CoreContainer {
CoreDescriptor cd = solrCores.getCoreDescriptor(name); CoreDescriptor cd = solrCores.getCoreDescriptor(name);
if (cd == null) { if (cd == null) {
SolrException solrException = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]"); throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
solrException.setMetadata("cause", "NonExistentCore");
throw solrException;
} }
boolean close = solrCores.isLoadedNotPendingClose(name); boolean close = solrCores.isLoadedNotPendingClose(name);