SOLR-9439: The delete shard API has been made more resilient against failures resulting from non-existent cores.

Shalin Shekhar Mangar 2016-08-30 23:44:22 +05:30
parent 2700b95211
commit 02b97a29b7
4 changed files with 64 additions and 47 deletions
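The "delete shard API" referred to in the commit message is the Collections API DELETESHARD action, whose Overseer-side handler (DeleteShardCmd, in the diff below) is rewritten here. Purely as a hedged illustration, not part of the commit itself — the collection name, shard name, and ZooKeeper address are placeholders — a client-side call through SolrJ looks roughly like this:

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;

public class DeleteShardSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder ZooKeeper address; point this at a real SolrCloud ensemble.
    try (CloudSolrClient client = new CloudSolrClient.Builder().withZkHost("localhost:9983").build()) {
      // Sends /admin/collections?action=DELETESHARD&collection=mycollection&shard=shard1_0.
      // After this commit, replicas whose cores are already gone no longer fail the whole request.
      CollectionAdminRequest.deleteShard("mycollection", "shard1_0").process(client);
    }
  }
}

The same request can be issued directly over HTTP against /admin/collections with the parameters shown in the comment.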

solr/CHANGES.txt

@@ -81,7 +81,8 @@ Bug Fixes
* SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
* SOLR-9439: Shard split clean up logic for older failed splits is faulty. (shalin)
* SOLR-9439: Shard split clean up logic for older failed splits is faulty. The delete shard API
  has also been made more resilient against failures resulting from non-existent cores. (shalin)
* SOLR-9430: Fix locale lookup in DIH <propertyWriter/> to use BCP47 language tags
to be consistent with other places in Solr. Language names still work for backwards

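The SOLR-9439 entries above also cover the shard-split side of the problem: when a failed SPLITSHARD is retried, SplitShardCmd first deletes the leftover inactive sub-shards, and that cleanup previously broke when some of the sub-shard cores had never been created or were already gone. A hedged sketch of such a retry from the client's point of view, with placeholder names and a placeholder ZooKeeper address:

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;

public class RetrySplitSketch {
  public static void main(String[] args) throws Exception {
    try (CloudSolrClient client = new CloudSolrClient.Builder().withZkHost("localhost:9983").build()) {
      // Re-issue the split; the Overseer removes sub-shards left by the earlier failed attempt
      // (via DELETESHARD) before splitting again, which is the path hardened by this commit.
      CollectionAdminRequest.SplitShard splitReq = CollectionAdminRequest.splitShard("mycollection");
      splitReq.setShardName("shard1");
      splitReq.process(client);
    }
  }
}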
solr/core/src/java/org/apache/solr/cloud/DeleteShardCmd.java

@@ -16,10 +16,14 @@
* limitations under the License.
*/
package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd;
@@ -27,18 +31,23 @@ import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
@@ -87,24 +96,42 @@ public class DeleteShardCmd implements Cmd {
      inQueue.offer(Utils.toJSON(m));
    }
    ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
    String asyncId = message.getStr(ASYNC);
    Map<String, String> requestMap = null;
    if (asyncId != null) {
      requestMap = new HashMap<>(slice.getReplicas().size(), 1.0f);
    }
    try {
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.UNLOAD.toString());
      params.set(CoreAdminParams.DELETE_INDEX, message.getBool(CoreAdminParams.DELETE_INDEX, true));
      params.set(CoreAdminParams.DELETE_INSTANCE_DIR, message.getBool(CoreAdminParams.DELETE_INSTANCE_DIR, true));
      params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
      ocmh.sliceCmd(clusterState, params, null, slice, shardHandler, asyncId, requestMap);
      ocmh.processResponses(results, shardHandler, true, "Failed to delete shard", asyncId, requestMap, Collections.emptySet());
      List<ZkNodeProps> replicas = getReplicasForSlice(collectionName, slice);
      CountDownLatch cleanupLatch = new CountDownLatch(replicas.size());
      for (ZkNodeProps r : replicas) {
        final ZkNodeProps replica = r.plus(message.getProperties()).plus("parallel", "true").plus(ASYNC, asyncId);
        log.info("Deleting replica for collection={} shard={} on node={}", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(CoreAdminParams.NODE));
        NamedList deleteResult = new NamedList();
        try {
          ((DeleteReplicaCmd)ocmh.commandMap.get(DELETEREPLICA)).deleteReplica(clusterState, replica, deleteResult, () -> {
            cleanupLatch.countDown();
            if (deleteResult.get("failure") != null) {
              synchronized (results) {
                results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
                    " on node=%s", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(NODE_NAME_PROP)));
              }
            }
            SimpleOrderedMap success = (SimpleOrderedMap) deleteResult.get("success");
            if (success != null) {
              synchronized (results) {
                results.add("success", success);
              }
            }
          });
        } catch (KeeperException e) {
          log.warn("Error deleting replica: " + r, e);
          cleanupLatch.countDown();
        } catch (Exception e) {
          log.warn("Error deleting replica: " + r, e);
          cleanupLatch.countDown();
          throw e;
        }
      }
      log.debug("Waiting for delete shard action to complete");
      cleanupLatch.await(5, TimeUnit.MINUTES);
      ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
          collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
@@ -114,7 +141,7 @@ public class DeleteShardCmd implements Cmd {
      // wait for a while until we don't see the shard
      TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
      boolean removed = false;
      while (! timeout.hasTimedOut()) {
      while (!timeout.hasTimedOut()) {
        Thread.sleep(100);
        DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
        removed = collection.getSlice(sliceId) == null;
@@ -129,7 +156,6 @@ public class DeleteShardCmd implements Cmd {
      }
      log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
    } catch (SolrException e) {
      throw e;
    } catch (Exception e) {
@@ -137,4 +163,18 @@ public class DeleteShardCmd implements Cmd {
          "Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
    }
  }
  private List<ZkNodeProps> getReplicasForSlice(String collectionName, Slice slice) {
    List<ZkNodeProps> sourceReplicas = new ArrayList<>();
    for (Replica replica : slice.getReplicas()) {
      ZkNodeProps props = new ZkNodeProps(
          COLLECTION_PROP, collectionName,
          SHARD_ID_PROP, slice.getName(),
          ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
          ZkStateReader.REPLICA_PROP, replica.getName(),
          CoreAdminParams.NODE, replica.getNodeName());
      sourceReplicas.add(props);
    }
    return sourceReplicas;
  }
}
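The rewrite above replaces the old approach of pushing an UNLOAD to every core through sliceCmd/processResponses with a per-replica fan-out: each replica is handed to DeleteReplicaCmd with a callback, and the CountDownLatch is counted down even when an individual delete fails, so one missing core can neither hang nor abort the whole shard delete. A standalone sketch of that pattern using plain JDK threads and placeholder names (not Solr's command classes):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

public class FanOutCleanupSketch {
  public static void main(String[] args) throws InterruptedException {
    // Placeholder replica names; DeleteShardCmd builds the real list in getReplicasForSlice(...).
    List<String> replicas = Arrays.asList("core_node1", "core_node2", "core_node3");
    List<String> results = new ArrayList<>();   // shared collector, guarded like 'results' above
    CountDownLatch cleanupLatch = new CountDownLatch(replicas.size());

    for (String replica : replicas) {
      // Each delete runs asynchronously and reports back through a callback,
      // mirroring the Runnable handed to DeleteReplicaCmd.deleteReplica(...).
      new Thread(() -> {
        boolean coreExists = !"core_node2".equals(replica);   // pretend one core is already gone
        synchronized (results) {
          results.add(replica + (coreExists ? ": deleted" : ": already absent, treated as cleanup success"));
        }
        cleanupLatch.countDown();   // always count down so the caller cannot wait forever on one bad replica
      }).start();
    }

    // Bounded wait for all callbacks, as DeleteShardCmd does (it allows five minutes).
    cleanupLatch.await(10, TimeUnit.SECONDS);
    synchronized (results) {
      results.forEach(System.out::println);
    }
  }
}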

solr/core/src/java/org/apache/solr/cloud/SplitShardCmd.java

@@ -219,8 +219,6 @@ public class SplitShardCmd implements Cmd {
              ZkNodeProps m = new ZkNodeProps(propMap);
              try {
                ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
              } catch (SolrException e) {
                throwIfNotNonExistentCoreException(subSlice, e);
              } catch (Exception e) {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
                    e);
@@ -233,7 +231,7 @@ public class SplitShardCmd implements Cmd {
    if (oldShardsDeleted) {
      // refresh the locally cached cluster state
      zkStateReader.forceUpdateCollection(collectionName);
      // we know we have the latest because otherwise deleteshard would have failed
      clusterState = zkStateReader.getClusterState();
      collection = clusterState.getCollection(collectionName);
    }
@@ -471,24 +469,4 @@ public class SplitShardCmd implements Cmd {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e);
    }
  }
  private void throwIfNotNonExistentCoreException(String subSlice, SolrException e) {
    Throwable t = e;
    String cause = null;
    while (t != null) {
      if (t instanceof SolrException) {
        SolrException solrException = (SolrException) t;
        cause = solrException.getMetadata("cause");
        if (cause != null && !"NonExistentCore".equals(cause)) {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
              e);
        }
      }
      t = t.getCause();
    }
    if (!"NonExistentCore".equals(cause)) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
          e);
    }
  }
}

solr/core/src/java/org/apache/solr/core/CoreContainer.java

@@ -1019,9 +1019,7 @@ public class CoreContainer {
    CoreDescriptor cd = solrCores.getCoreDescriptor(name);
    if (cd == null) {
      SolrException solrException = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
      solrException.setMetadata("cause", "NonExistentCore");
      throw solrException;
      throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
    }
    boolean close = solrCores.isLoadedNotPendingClose(name);
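For reference, the signalling that this commit retires worked through SolrException's metadata map: CoreContainer.unload() tagged the exception with cause=NonExistentCore (the setMetadata call shown above), and the now-removed SplitShardCmd helper walked the cause chain looking for that tag so it could ignore deletes of cores that never existed. A minimal standalone sketch of both halves, with a placeholder core name:

import org.apache.solr.common.SolrException;

public class ExceptionMetadataSketch {
  public static void main(String[] args) {
    // Thrower side: what CoreContainer.unload() did before this commit.
    SolrException e = new SolrException(SolrException.ErrorCode.BAD_REQUEST,
        "Cannot unload non-existent core [demo_core]");   // placeholder core name
    e.setMetadata("cause", "NonExistentCore");

    // Catcher side: the check SplitShardCmd no longer needs, because DeleteShardCmd
    // now tolerates missing cores directly.
    if ("NonExistentCore".equals(e.getMetadata("cause"))) {
      System.out.println("old behaviour: treat the missing core as already cleaned up");
    }
  }
}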