SOLR-8551: Make collection deletion more robust.

This commit is contained in:
markrmiller 2016-02-08 21:39:01 -05:00
parent ea21b8fae8
commit 899f1fcf74
10 changed files with 202 additions and 33 deletions

View File

@ -477,7 +477,9 @@ Bug Fixes
* SOLR-8651: The commitWithin parameter is not passed on for deleteById in UpdateRequest in
distributed queries (Jessica Cheng Mallet via Erick Erickson)
* SOLR-8551: Make collection deletion more robust. (Mark Miller)
Optimizations
----------------------

View File

@ -37,6 +37,7 @@ import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.CoreAdminRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
@ -47,6 +48,7 @@ import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.cloud.rule.ReplicaAssigner;
import org.apache.solr.cloud.rule.ReplicaAssigner.Position;
import org.apache.solr.cloud.rule.Rule;
import org.apache.solr.common.NonExistentCoreException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.Aliases;
@ -67,14 +69,12 @@ import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.admin.ClusterStatus;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardHandlerFactory;
import org.apache.solr.handler.component.ShardRequest;
@ -108,7 +108,6 @@ import static org.apache.solr.common.params.CollectionParams.CollectionAction.AD
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICAPROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDROLE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.BALANCESHARDUNIQUE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CLUSTERSTATUS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETE;
@ -769,6 +768,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
private void deleteCollection(ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
final String collection = message.getStr(NAME);
try {
if (zkStateReader.getClusterState().getCollectionOrNull(collection) == null) {
if (zkStateReader.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) {
// if the collection is not in the clusterstate, but is listed in zk, do nothing, it will just
// be removed in the finally - we cannot continue, because the below code will error if the collection
// is not in the clusterstate
return;
}
}
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CoreAdminParams.ACTION, CoreAdminAction.UNLOAD.toString());
params.set(CoreAdminParams.DELETE_INSTANCE_DIR, true);
@ -779,7 +786,11 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
if (asyncId != null) {
requestMap = new HashMap<>();
}
collectionCmd(message, params, results, null, asyncId, requestMap);
Set<String> okayExceptions = new HashSet<>(1);
okayExceptions.add(NonExistentCoreException.class.getName());
collectionCmd(message, params, results, null, asyncId, requestMap, okayExceptions);
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETE.toLower(), NAME, collection);
Overseer.getInQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
@ -1002,7 +1013,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
sendShardRequest(nodeName, params, shardHandler, async, requestMap);
}
processResponses(results, shardHandler, true, "Failed to create shard", async, requestMap);
processResponses(results, shardHandler, true, "Failed to create shard", async, requestMap, Collections.emptySet());
log.info("Finished create command on all shards for collection: " + collectionName);
@ -1398,9 +1409,9 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
UpdateResponse updateResponse = null;
try {
updateResponse = softCommit(coreUrl);
processResponse(results, null, coreUrl, updateResponse, slice);
processResponse(results, null, coreUrl, updateResponse, slice, Collections.emptySet());
} catch (Exception e) {
processResponse(results, e, coreUrl, updateResponse, slice);
processResponse(results, e, coreUrl, updateResponse, slice, Collections.emptySet());
throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to call distrib softCommit on: " + coreUrl, e);
}
}
@ -1509,7 +1520,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
params.set(CoreAdminParams.DELETE_INDEX, "true");
sliceCmd(clusterState, params, null, slice, shardHandler, asyncId, requestMap);
processResponses(results, shardHandler, true, "Failed to delete shard", asyncId, requestMap);
processResponses(results, shardHandler, true, "Failed to delete shard", asyncId, requestMap, Collections.emptySet());
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
collection, ZkStateReader.SHARD_ID_PROP, sliceId);
@ -2048,7 +2059,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
}
}
processResponses(results, shardHandler, false, null, async, requestMap);
processResponses(results, shardHandler, false, null, async, requestMap, Collections.emptySet());
log.debug("Finished create command on all shards for collection: "
+ collectionName);
@ -2217,15 +2228,19 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
processResponses(results, shardHandler, true, "ADDREPLICA failed to create replica", asyncId, requestMap);
}
/**
 * Convenience overload for callers that do not tolerate any remote exception class: forwards every
 * argument unchanged to the seven-argument variant together with an empty set of okay exceptions.
 */
private void processResponses(NamedList results, ShardHandler shardHandler, boolean abortOnError, String msgOnError,
    String asyncId, Map<String, String> requestMap) {
  final Set<String> noOkayExceptions = Collections.emptySet();
  processResponses(results, shardHandler, abortOnError, msgOnError, asyncId, requestMap, noOkayExceptions);
}
private void processResponses(NamedList results, ShardHandler shardHandler, boolean abortOnError, String msgOnError,
String asyncId, Map<String, String> requestMap, Set<String> okayExceptions) {
//Processes all shard responses
ShardResponse srsp;
do {
srsp = shardHandler.takeCompletedOrError();
if (srsp != null) {
processResponse(results, srsp);
processResponse(results, srsp, okayExceptions);
Throwable exception = srsp.getException();
if (abortOnError && exception != null) {
// drain pending requests
@ -2294,9 +2309,15 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
}
}
/**
 * Convenience overload that runs the collection command with an empty set of okay exceptions;
 * all other arguments are passed through unchanged to the seven-argument variant.
 */
private void collectionCmd(ZkNodeProps message, ModifiableSolrParams params,
    NamedList results, Replica.State stateMatcher, String asyncId, Map<String, String> requestMap) {
  final Set<String> noOkayExceptions = Collections.emptySet();
  collectionCmd(message, params, results, stateMatcher, asyncId, requestMap, noOkayExceptions);
}
private void collectionCmd(ZkNodeProps message, ModifiableSolrParams params,
NamedList results, Replica.State stateMatcher, String asyncId, Map<String, String> requestMap, Set<String> okayExceptions) {
log.info("Executing Collection Cmd : " + params);
String collectionName = message.getStr(NAME);
ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
@ -2308,7 +2329,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
sliceCmd(clusterState, params, stateMatcher, slice, shardHandler, asyncId, requestMap);
}
processResponses(results, shardHandler, false, null, asyncId, requestMap);
processResponses(results, shardHandler, false, null, asyncId, requestMap, okayExceptions);
}
@ -2328,19 +2349,24 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
}
}
}
private void processResponse(NamedList results, ShardResponse srsp) {
private void processResponse(NamedList results, ShardResponse srsp, Set<String> okayExceptions) {
Throwable e = srsp.getException();
String nodeName = srsp.getNodeName();
SolrResponse solrResponse = srsp.getSolrResponse();
String shard = srsp.getShard();
processResponse(results, e, nodeName, solrResponse, shard);
processResponse(results, e, nodeName, solrResponse, shard, okayExceptions);
}
@SuppressWarnings("unchecked")
private void processResponse(NamedList results, Throwable e, String nodeName, SolrResponse solrResponse, String shard) {
if (e != null) {
private void processResponse(NamedList results, Throwable e, String nodeName, SolrResponse solrResponse, String shard, Set<String> okayExceptions) {
String rootThrowable = null;
if (e instanceof RemoteSolrException) {
rootThrowable = ((RemoteSolrException) e).getRootThrowable();
}
if (e != null && (rootThrowable == null || !okayExceptions.contains(rootThrowable))) {
log.error("Error from shard: " + shard, e);
SimpleOrderedMap failure = (SimpleOrderedMap) results.get("failure");
@ -2394,7 +2420,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
srsp = shardHandler.takeCompletedOrError();
if (srsp != null) {
NamedList results = new NamedList();
processResponse(results, srsp);
processResponse(results, srsp, Collections.emptySet());
String r = (String) srsp.getSolrResponse().getResponse().get("STATUS");
if (r.equals("running")) {
log.debug("The task is still RUNNING, continuing to wait.");

View File

@ -675,7 +675,7 @@ public class CoreContainer {
if (isShutDown) {
core.close();
throw new IllegalStateException("This CoreContainer has been close");
throw new IllegalStateException("This CoreContainer has been closed");
}
if (cd.isTransient()) {
old = solrCores.putTransientCore(cfg, name, core, loader);

View File

@ -37,6 +37,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.SyncStrategy;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.NonExistentCoreException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
@ -68,6 +69,7 @@ import org.apache.solr.update.processor.UpdateRequestProcessorChain;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.util.PropertiesUtil;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.TestInjection;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -131,13 +133,15 @@ enum CoreAdminOperation {
},
UNLOAD_OP(UNLOAD) {
@Override
public void call(CallInfo callInfo) {
public void call(CallInfo callInfo) throws IOException {
SolrParams params = callInfo.req.getParams();
String cname = params.get(CoreAdminParams.CORE);
boolean deleteIndexDir = params.getBool(CoreAdminParams.DELETE_INDEX, false);
boolean deleteDataDir = params.getBool(CoreAdminParams.DELETE_DATA_DIR, false);
boolean deleteInstanceDir = params.getBool(CoreAdminParams.DELETE_INSTANCE_DIR, false);
callInfo.handler.coreContainer.unload(cname, deleteIndexDir, deleteDataDir, deleteInstanceDir);
assert TestInjection.injectNonExistentCoreExceptionAfterUnload(cname);
}
},
RELOAD_OP(RELOAD) {

View File

@ -42,8 +42,12 @@ public class ResponseUtils {
SolrException solrExc = (SolrException)ex;
code = solrExc.code();
NamedList<String> errorMetadata = solrExc.getMetadata();
if (errorMetadata != null)
info.add("metadata", errorMetadata);
if (errorMetadata == null) {
errorMetadata = new NamedList<>();
}
errorMetadata.add(SolrException.ERROR_CLASS, ex.getClass().getName());
errorMetadata.add(SolrException.ROOT_ERROR_CLASS, SolrException.getRootCause(ex).getClass().getName());
info.add("metadata", errorMetadata);
}
for (Throwable th = ex; th != null; th = th.getCause()) {

View File

@ -26,6 +26,7 @@ import java.util.TimerTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.common.NonExistentCoreException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.Pair;
@ -65,14 +66,18 @@ public class TestInjection {
public static String failUpdateRequests = null;
public static String nonExistentCoreExceptionAfterUnload = null;
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
public static void reset() {
nonGracefullClose = null;
failReplicaRequests = null;
failUpdateRequests = null;
nonExistentCoreExceptionAfterUnload = null;
for (Timer timer : timers) {
timer.cancel();
}
@ -142,6 +147,19 @@ public class TestInjection {
return true;
}
/**
 * Test hook: when {@link #nonExistentCoreExceptionAfterUnload} is set and parses to an enabled
 * setting, randomly throws a {@link NonExistentCoreException} naming the given core.
 *
 * @param cname core name included in the exception message
 * @return always {@code true} when no exception is thrown, so the call can sit inside an {@code assert}
 * @throws NonExistentCoreException when injection is enabled and the random draw falls in the configured range
 */
public static boolean injectNonExistentCoreExceptionAfterUnload(String cname) {
  // Nothing configured: injection is off.
  if (nonExistentCoreExceptionAfterUnload == null) {
    return true;
  }

  Pair<Boolean,Integer> setting = parseValue(nonExistentCoreExceptionAfterUnload);
  boolean enabled = setting.getKey();
  int chanceIn100 = setting.getValue();

  // Only consume a random number when the injection is enabled.
  if (!enabled) {
    return true;
  }
  int roll = RANDOM.nextInt(100);
  if (roll >= (100 - chanceIn100)) {
    throw new NonExistentCoreException("Core not found to unload: " + cname);
  }
  return true;
}
private static Pair<Boolean,Integer> parseValue(String raw) {
Matcher m = ENABLED_PERCENT.matcher(raw);
if (!m.matches()) throw new RuntimeException("No match, probably bad syntax: " + raw);
@ -153,5 +171,4 @@ public class TestInjection {
return new Pair<>(Boolean.parseBoolean(val), Integer.parseInt(percent));
}
}

View File

@ -68,6 +68,7 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrInfoMBean.Category;
import org.apache.solr.util.TestInjection;
import org.apache.solr.util.TimeOut;
import org.junit.Test;
import org.slf4j.Logger;
@ -187,12 +188,9 @@ public class CollectionsAPIDistributedZkTest extends AbstractFullDistribZkTestBa
params.set("name", collectionName);
QueryRequest request = new QueryRequest(params);
request.setPath("/admin/collections");
try {
makeRequest(baseUrl, request);
fail("Expected to fail, because collection is not in clusterstate");
} catch (RemoteSolrException e) {
}
// there are remnants of the collection in zk, should work
makeRequest(baseUrl, request);
assertCollectionNotExists(collectionName, 45);
@ -238,6 +236,88 @@ public class CollectionsAPIDistributedZkTest extends AbstractFullDistribZkTestBa
makeRequest(baseUrl, request);
}
/**
 * Creates only the collection's ZooKeeper node, issues a Collections API DELETE for it, and checks
 * that the collection is gone. Then verifies that a collection of the same name can be created and
 * deleted again.
 */
private void deleteCollectionOnlyInZk() throws Exception {
  final String baseUrl = getBaseUrl((HttpSolrClient) clients.get(0));
  String collectionName = "onlyinzk";

  // Plant a bare collection node in zk without going through the Collections API.
  String collectionZkPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName;
  cloudClient.getZkStateReader().getZkClient().makePath(collectionZkPath, true);

  // Deleting the zk-only collection is expected to succeed.
  ModifiableSolrParams deleteParams = new ModifiableSolrParams();
  deleteParams.set("action", CollectionAction.DELETE.toString());
  deleteParams.set("name", collectionName);
  QueryRequest deleteRequest = new QueryRequest(deleteParams);
  deleteRequest.setPath("/admin/collections");
  makeRequest(baseUrl, deleteRequest);

  assertCollectionNotExists(collectionName, 45);

  // now creating that collection should work
  ModifiableSolrParams createParams = new ModifiableSolrParams();
  createParams.set("action", CollectionAction.CREATE.toString());
  createParams.set("name", collectionName);
  createParams.set("numShards", 2);
  QueryRequest createRequest = new QueryRequest(createParams);
  createRequest.setPath("/admin/collections");
  if (secondConfigSet) {
    createParams.set("collection.configName", "conf1");
  }
  makeRequest(baseUrl, createRequest);

  waitForRecoveriesToFinish(collectionName, false);

  // Clean up: remove the collection that was just created.
  ModifiableSolrParams cleanupParams = new ModifiableSolrParams();
  cleanupParams.set("action", CollectionAction.DELETE.toString());
  cleanupParams.set("name", collectionName);
  QueryRequest cleanupRequest = new QueryRequest(cleanupParams);
  cleanupRequest.setPath("/admin/collections");
  makeRequest(baseUrl, cleanupRequest);
}
/**
 * Creates a collection, deletes it through the Collections API, and asserts that the response has no
 * "failure" entry and that the collection is gone. Then verifies that a collection of the same name
 * can be created and deleted again.
 *
 * NOTE(review): presumably run with the TestInjection "non-existent core after unload" setting
 * enabled elsewhere in this test class - confirm against the class setup.
 */
private void deleteCollectionWithUnloadedCore() throws Exception {
  final String baseUrl = getBaseUrl((HttpSolrClient) clients.get(0));
  String collectionName = "corealreadyunloaded";

  try (SolrClient client = createNewSolrClient("", baseUrl)) {
    createCollection(null, collectionName, 2, 1, 2, client, null, "conf1");
  }
  waitForRecoveriesToFinish(collectionName, false);

  // Delete the collection and inspect the response.
  ModifiableSolrParams deleteParams = new ModifiableSolrParams();
  deleteParams.set("action", CollectionAction.DELETE.toString());
  deleteParams.set("name", collectionName);
  QueryRequest deleteRequest = new QueryRequest(deleteParams);
  deleteRequest.setPath("/admin/collections");
  NamedList<Object> result = makeRequest(baseUrl, deleteRequest);
  System.out.println("result:" + result);
  Object failure = result.get("failure");
  assertNull("We expect no failures", failure);

  assertCollectionNotExists(collectionName, 45);

  // now creating that collection should work
  ModifiableSolrParams createParams = new ModifiableSolrParams();
  createParams.set("action", CollectionAction.CREATE.toString());
  createParams.set("name", collectionName);
  createParams.set("numShards", 2);
  QueryRequest createRequest = new QueryRequest(createParams);
  createRequest.setPath("/admin/collections");
  if (secondConfigSet) {
    createParams.set("collection.configName", "conf1");
  }
  makeRequest(baseUrl, createRequest);

  // Clean up: remove the collection that was just created.
  ModifiableSolrParams cleanupParams = new ModifiableSolrParams();
  cleanupParams.set("action", CollectionAction.DELETE.toString());
  cleanupParams.set("name", collectionName);
  QueryRequest cleanupRequest = new QueryRequest(cleanupParams);
  cleanupRequest.setPath("/admin/collections");
  makeRequest(baseUrl, cleanupRequest);
}
private void deleteCollectionWithDownNodes() throws Exception {
String baseUrl = getBaseUrl((HttpSolrClient) clients.get(0));
@ -291,6 +371,7 @@ public class CollectionsAPIDistributedZkTest extends AbstractFullDistribZkTestBa
/**
 * Sends the request through a freshly created client for {@code baseUrl}; the client is closed
 * before this method returns.
 *
 * @param baseUrl base URL the temporary client is created for
 * @param request request to execute
 * @return the response returned by the client
 */
private NamedList<Object> makeRequest(String baseUrl, SolrRequest request)
    throws SolrServerException, IOException {
  try (SolrClient solrClient = createNewSolrClient("", baseUrl)) {
    // Apply a socket timeout of 30000 before issuing the call (presumably milliseconds - confirm).
    HttpSolrClient httpClient = (HttpSolrClient) solrClient;
    httpClient.setSoTimeout(30000);
    NamedList<Object> response = solrClient.request(request);
    return response;
  }
}
@ -298,7 +379,6 @@ public class CollectionsAPIDistributedZkTest extends AbstractFullDistribZkTestBa
private void testErrorHandling() throws Exception {
final String baseUrl = getBaseUrl((HttpSolrClient) clients.get(0));
// try a bad action
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("action", "BADACTION");

View File

@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common;
/**
 * A {@link SolrException} that is always constructed with {@code ErrorCode.BAD_REQUEST}.
 */
public class NonExistentCoreException extends SolrException {
/**
 * @param msg detail message handed to the {@link SolrException} constructor
 */
public NonExistentCoreException(String msg) {
super(ErrorCode.BAD_REQUEST, msg);
}
}

View File

@ -31,6 +31,8 @@ import org.slf4j.MDC;
*/
public class SolrException extends RuntimeException {
public static final String ROOT_ERROR_CLASS = "root-error-class";
public static final String ERROR_CLASS = "error-class";
final private Map mdcContext;
/**
@ -126,6 +128,14 @@ public class SolrException extends RuntimeException {
metadata = new NamedList<String>();
metadata.add(key, value);
}
/**
 * @return the metadata value stored under {@link #ERROR_CLASS}, as returned by {@code getMetadata}
 */
public String getThrowable() {
  final String errorClass = getMetadata(ERROR_CLASS);
  return errorClass;
}
/**
 * @return the metadata value stored under {@link #ROOT_ERROR_CLASS}, as returned by {@code getMetadata}
 */
public String getRootThrowable() {
  final String rootErrorClass = getMetadata(ROOT_ERROR_CLASS);
  return rootErrorClass;
}
public void log(Logger log) { log(log,this); }
public static void log(Logger log, Throwable e) {

View File

@ -1789,6 +1789,7 @@ public abstract class AbstractFullDistribZkTestBase extends AbstractDistribZkTes
/**
 * Waits for the collection to disappear, then asserts that its node under
 * {@code ZkStateReader.COLLECTIONS_ZKNODE} no longer exists in ZooKeeper.
 *
 * @param collectionName name of the collection expected to be gone
 * @param timeoutSeconds timeout handed to {@code waitForCollectionToDisappear}
 */
protected void assertCollectionNotExists(String collectionName, int timeoutSeconds) throws Exception {
  ZkStateReader commonReader = getCommonCloudSolrClient().getZkStateReader();
  waitForCollectionToDisappear(collectionName, commonReader, false, true, timeoutSeconds);

  // The collection's own zk node must be removed as well.
  String collectionZkPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName;
  boolean stillInZk = cloudClient.getZkStateReader().getZkClient().exists(collectionZkPath, true);
  assertFalse(stillInZk);
}