mirror of https://github.com/apache/lucene.git
SOLR-9439: The delete shard API has been made more resilient against failures resulting from non-existent cores.
This commit is contained in:
parent
2700b95211
commit
02b97a29b7
|
@ -81,7 +81,8 @@ Bug Fixes
|
||||||
|
|
||||||
* SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
|
* SOLR-9445: Admin requests are retried by CloudSolrClient and LBHttpSolrClient on failure. (shalin)
|
||||||
|
|
||||||
* SOLR-9439: Shard split clean up logic for older failed splits is faulty. (shalin)
|
* SOLR-9439: Shard split clean up logic for older failed splits is faulty. The delete shard API
|
||||||
|
has also been made more resilient against failures resulting from non-existent cores. (shalin)
|
||||||
|
|
||||||
* SOLR-9430: Fix locale lookup in DIH <propertyWriter/> to use BCP47 language tags
|
* SOLR-9430: Fix locale lookup in DIH <propertyWriter/> to use BCP47 language tags
|
||||||
to be consistent with other places in Solr. Language names still work for backwards
|
to be consistent with other places in Solr. Language names still work for backwards
|
||||||
|
|
|
@ -16,10 +16,14 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.cloud;
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
import java.util.Collections;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.CountDownLatch;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd;
|
import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd;
|
||||||
|
@ -27,18 +31,23 @@ import org.apache.solr.cloud.overseer.OverseerAction;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.cloud.ClusterState;
|
import org.apache.solr.common.cloud.ClusterState;
|
||||||
import org.apache.solr.common.cloud.DocCollection;
|
import org.apache.solr.common.cloud.DocCollection;
|
||||||
|
import org.apache.solr.common.cloud.Replica;
|
||||||
import org.apache.solr.common.cloud.Slice;
|
import org.apache.solr.common.cloud.Slice;
|
||||||
import org.apache.solr.common.cloud.ZkNodeProps;
|
import org.apache.solr.common.cloud.ZkNodeProps;
|
||||||
import org.apache.solr.common.cloud.ZkStateReader;
|
import org.apache.solr.common.cloud.ZkStateReader;
|
||||||
import org.apache.solr.common.params.CoreAdminParams;
|
import org.apache.solr.common.params.CoreAdminParams;
|
||||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||||
import org.apache.solr.common.util.Utils;
|
import org.apache.solr.common.util.Utils;
|
||||||
import org.apache.solr.handler.component.ShardHandler;
|
|
||||||
import org.apache.solr.util.TimeOut;
|
import org.apache.solr.util.TimeOut;
|
||||||
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
|
||||||
|
import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP;
|
||||||
|
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
|
||||||
|
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
|
||||||
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
|
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
|
||||||
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
|
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
|
||||||
|
|
||||||
|
@ -87,24 +96,42 @@ public class DeleteShardCmd implements Cmd {
|
||||||
inQueue.offer(Utils.toJSON(m));
|
inQueue.offer(Utils.toJSON(m));
|
||||||
}
|
}
|
||||||
|
|
||||||
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
|
|
||||||
|
|
||||||
String asyncId = message.getStr(ASYNC);
|
String asyncId = message.getStr(ASYNC);
|
||||||
Map<String, String> requestMap = null;
|
|
||||||
if (asyncId != null) {
|
|
||||||
requestMap = new HashMap<>(slice.getReplicas().size(), 1.0f);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ModifiableSolrParams params = new ModifiableSolrParams();
|
List<ZkNodeProps> replicas = getReplicasForSlice(collectionName, slice);
|
||||||
params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.UNLOAD.toString());
|
CountDownLatch cleanupLatch = new CountDownLatch(replicas.size());
|
||||||
params.set(CoreAdminParams.DELETE_INDEX, message.getBool(CoreAdminParams.DELETE_INDEX, true));
|
for (ZkNodeProps r : replicas) {
|
||||||
params.set(CoreAdminParams.DELETE_INSTANCE_DIR, message.getBool(CoreAdminParams.DELETE_INSTANCE_DIR, true));
|
final ZkNodeProps replica = r.plus(message.getProperties()).plus("parallel", "true").plus(ASYNC, asyncId);
|
||||||
params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
|
log.info("Deleting replica for collection={} shard={} on node={}", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(CoreAdminParams.NODE));
|
||||||
|
NamedList deleteResult = new NamedList();
|
||||||
ocmh.sliceCmd(clusterState, params, null, slice, shardHandler, asyncId, requestMap);
|
try {
|
||||||
|
((DeleteReplicaCmd)ocmh.commandMap.get(DELETEREPLICA)).deleteReplica(clusterState, replica, deleteResult, () -> {
|
||||||
ocmh.processResponses(results, shardHandler, true, "Failed to delete shard", asyncId, requestMap, Collections.emptySet());
|
cleanupLatch.countDown();
|
||||||
|
if (deleteResult.get("failure") != null) {
|
||||||
|
synchronized (results) {
|
||||||
|
results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
|
||||||
|
" on node=%s", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(NODE_NAME_PROP)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
SimpleOrderedMap success = (SimpleOrderedMap) deleteResult.get("success");
|
||||||
|
if (success != null) {
|
||||||
|
synchronized (results) {
|
||||||
|
results.add("success", success);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (KeeperException e) {
|
||||||
|
log.warn("Error deleting replica: " + r, e);
|
||||||
|
cleanupLatch.countDown();
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Error deleting replica: " + r, e);
|
||||||
|
cleanupLatch.countDown();
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.debug("Waiting for delete shard action to complete");
|
||||||
|
cleanupLatch.await(5, TimeUnit.MINUTES);
|
||||||
|
|
||||||
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
|
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
|
||||||
collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
|
collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
|
||||||
|
@ -114,7 +141,7 @@ public class DeleteShardCmd implements Cmd {
|
||||||
// wait for a while until we don't see the shard
|
// wait for a while until we don't see the shard
|
||||||
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
|
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
|
||||||
boolean removed = false;
|
boolean removed = false;
|
||||||
while (! timeout.hasTimedOut()) {
|
while (!timeout.hasTimedOut()) {
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
|
DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
|
||||||
removed = collection.getSlice(sliceId) == null;
|
removed = collection.getSlice(sliceId) == null;
|
||||||
|
@ -129,7 +156,6 @@ public class DeleteShardCmd implements Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
|
log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
|
||||||
|
|
||||||
} catch (SolrException e) {
|
} catch (SolrException e) {
|
||||||
throw e;
|
throw e;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -137,4 +163,18 @@ public class DeleteShardCmd implements Cmd {
|
||||||
"Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
|
"Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<ZkNodeProps> getReplicasForSlice(String collectionName, Slice slice) {
|
||||||
|
List<ZkNodeProps> sourceReplicas = new ArrayList<>();
|
||||||
|
for (Replica replica : slice.getReplicas()) {
|
||||||
|
ZkNodeProps props = new ZkNodeProps(
|
||||||
|
COLLECTION_PROP, collectionName,
|
||||||
|
SHARD_ID_PROP, slice.getName(),
|
||||||
|
ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
|
||||||
|
ZkStateReader.REPLICA_PROP, replica.getName(),
|
||||||
|
CoreAdminParams.NODE, replica.getNodeName());
|
||||||
|
sourceReplicas.add(props);
|
||||||
|
}
|
||||||
|
return sourceReplicas;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -219,8 +219,6 @@ public class SplitShardCmd implements Cmd {
|
||||||
ZkNodeProps m = new ZkNodeProps(propMap);
|
ZkNodeProps m = new ZkNodeProps(propMap);
|
||||||
try {
|
try {
|
||||||
ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
|
ocmh.commandMap.get(DELETESHARD).call(clusterState, m, new NamedList());
|
||||||
} catch (SolrException e) {
|
|
||||||
throwIfNotNonExistentCoreException(subSlice, e);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
|
||||||
e);
|
e);
|
||||||
|
@ -233,7 +231,7 @@ public class SplitShardCmd implements Cmd {
|
||||||
|
|
||||||
if (oldShardsDeleted) {
|
if (oldShardsDeleted) {
|
||||||
// refresh the locally cached cluster state
|
// refresh the locally cached cluster state
|
||||||
zkStateReader.forceUpdateCollection(collectionName);
|
// we know we have the latest because otherwise deleteshard would have failed
|
||||||
clusterState = zkStateReader.getClusterState();
|
clusterState = zkStateReader.getClusterState();
|
||||||
collection = clusterState.getCollection(collectionName);
|
collection = clusterState.getCollection(collectionName);
|
||||||
}
|
}
|
||||||
|
@ -471,24 +469,4 @@ public class SplitShardCmd implements Cmd {
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e);
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void throwIfNotNonExistentCoreException(String subSlice, SolrException e) {
|
|
||||||
Throwable t = e;
|
|
||||||
String cause = null;
|
|
||||||
while (t != null) {
|
|
||||||
if (t instanceof SolrException) {
|
|
||||||
SolrException solrException = (SolrException) t;
|
|
||||||
cause = solrException.getMetadata("cause");
|
|
||||||
if (cause != null && !"NonExistentCore".equals(cause)) {
|
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
|
|
||||||
e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t = t.getCause();
|
|
||||||
}
|
|
||||||
if (!"NonExistentCore".equals(cause)) {
|
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to delete already existing sub shard: " + subSlice,
|
|
||||||
e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1019,9 +1019,7 @@ public class CoreContainer {
|
||||||
|
|
||||||
CoreDescriptor cd = solrCores.getCoreDescriptor(name);
|
CoreDescriptor cd = solrCores.getCoreDescriptor(name);
|
||||||
if (cd == null) {
|
if (cd == null) {
|
||||||
SolrException solrException = new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
|
throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
|
||||||
solrException.setMetadata("cause", "NonExistentCore");
|
|
||||||
throw solrException;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean close = solrCores.isLoadedNotPendingClose(name);
|
boolean close = solrCores.isLoadedNotPendingClose(name);
|
||||||
|
|
Loading…
Reference in New Issue