SOLR-8279: Add a new test fault injection approach and a new SolrCloud test that stops and starts the cluster while indexing data and with random faults.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1720613 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2015-12-17 17:08:10 +00:00
parent bd66345675
commit fe7ef38b2e
24 changed files with 559 additions and 75 deletions

View File

@ -339,6 +339,9 @@ Other Changes
* SOLR-8410: Add all read paths to 'read' permission in RuleBasedAuthorizationPlugin (noble)
* SOLR-8279: Add a new test fault injection approach and a new SolrCloud test that stops and starts the cluster
while indexing data and with random faults. (Mark Miller)
================== 5.4.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release

View File

@ -373,6 +373,9 @@ public class JettySolrRunner {
* @return the {@link CoreContainer} for this node
*/
public CoreContainer getCoreContainer() {
if (getSolrDispatchFilter() == null || getSolrDispatchFilter().getCores() == null) {
return null;
}
return getSolrDispatchFilter().getCores();
}

View File

@ -351,14 +351,14 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
}
if (isClosed()) {
log.info("Recovery was cancelled");
log.info("RecoveryStrategy has been closed");
break;
}
sendPrepRecoveryCmd(leaderBaseUrl, leaderCoreName, slice);
if (isClosed()) {
log.info("Recovery was cancelled");
log.info("RecoveryStrategy has been closed");
break;
}
@ -420,7 +420,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
}
if (isClosed()) {
log.info("Recovery was cancelled");
log.info("RecoveryStrategy has been closed");
break;
}
@ -431,7 +431,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
replicate(zkController.getNodeName(), core, leaderprops);
if (isClosed()) {
log.info("Recovery was cancelled");
log.info("RecoveryStrategy has been closed");
break;
}
@ -439,7 +439,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
replayed = true;
if (isClosed()) {
log.info("Recovery was cancelled");
log.info("RecoveryStrategy has been closed");
break;
}
@ -494,6 +494,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
try {
if (isClosed()) {
log.info("RecoveryStrategy has been closed");
break;
}
@ -518,7 +519,10 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
double loopCount = Math.min(Math.pow(2, retries), 60);
log.info("Wait {} seconds before trying to recover again ({})", loopCount, retries);
for (int i = 0; i < loopCount; i++) {
if (isClosed()) break; // check if someone closed us
if (isClosed()) {
log.info("RecoveryStrategy has been closed");
break; // check if someone closed us
}
Thread.sleep(STARTING_RECOVERY_DELAY);
}
} catch (InterruptedException e) {
@ -537,7 +541,7 @@ public class RecoveryStrategy extends Thread implements ClosableThread {
core.seedVersionBuckets();
}
log.info("Finished recovery process.");
log.info("Finished recovery process, successful=", Boolean.toString(successfulRecovery));
}

View File

@ -2322,7 +2322,10 @@ public final class ZkController {
private void unregisterConfListener(String confDir, Runnable listener) {
synchronized (confDirectoryListeners) {
final Set<Runnable> listeners = confDirectoryListeners.get(confDir);
assert listeners != null : confDir + " has no more registered listeners, but a live one attempts to unregister!";
if (listeners == null) {
log.warn(confDir + " has no more registered listeners, but a live one attempted to unregister!");
return;
}
if (listeners.remove(listener)) {
log.info("removed listener for config directory [{}]", confDir);
}

View File

@ -43,6 +43,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.ObjectReleaseTracker;
import org.apache.solr.store.blockcache.BlockDirectory;
import org.apache.solr.store.hdfs.HdfsDirectory;
import org.apache.solr.store.hdfs.HdfsLockFactory;
@ -305,6 +306,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
try {
log.info("Closing directory: " + val.path);
val.directory.close();
assert ObjectReleaseTracker.release(val.directory);
} catch (Exception e) {
SolrException.log(log, "Error closing directory", e);
}
@ -347,6 +349,7 @@ public abstract class CachingDirectoryFactory extends DirectoryFactory {
if (directory == null) {
directory = create(fullPath, createLockFactory(rawLockType), dirContext);
assert ObjectReleaseTracker.track(directory);
boolean success = false;
try {
CacheValue newCacheValue = new CacheValue(fullPath, directory);

View File

@ -38,7 +38,6 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.io.FileUtils;
@ -66,6 +65,7 @@ import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.ObjectReleaseTracker;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.DirectoryFactory.DirContext;
import org.apache.solr.handler.IndexFetcher;
@ -126,12 +126,6 @@ import static org.apache.solr.common.params.CommonParams.PATH;
public final class SolrCore implements SolrInfoMBean, Closeable {
public static final String version="1.0";
// These should *only* be used for debugging or monitoring purposes
public static final AtomicLong numOpens = new AtomicLong();
public static final AtomicLong numCloses = new AtomicLong();
public static Map<SolrCore,Exception> openHandles = Collections.synchronizedMap(new IdentityHashMap<SolrCore, Exception>());
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static final Logger requestLog = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass().getName() + ".Request");
@ -714,6 +708,9 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
IndexSchema schema, NamedList configSetProperties,
CoreDescriptor coreDescriptor, UpdateHandler updateHandler,
IndexDeletionPolicyWrapper delPolicy, SolrCore prev) {
assert ObjectReleaseTracker.track(searcherExecutor); // ensure that in unclean shutdown tests we still close this
checkNotNull(coreDescriptor, "coreDescriptor cannot be null");
this.coreDescriptor = coreDescriptor;
@ -852,13 +849,11 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
seedVersionBuckets();
bufferUpdatesIfConstructing(coreDescriptor);
// For debugging
// numOpens.incrementAndGet();
// openHandles.put(this, new RuntimeException("unclosed core - name:" + getName() + " refs: " + refCount.get()));
this.ruleExpiryLock = new ReentrantLock();
registerConfListener();
assert ObjectReleaseTracker.track(this);
}
public void seedVersionBuckets() {
@ -1227,6 +1222,7 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
throw (Error) e;
}
}
assert ObjectReleaseTracker.release(searcherExecutor);
try {
// Since we waited for the searcherExecutor to shut down,
@ -1266,7 +1262,6 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
}
if( closeHooks != null ) {
for( CloseHook hook : closeHooks ) {
try {
@ -1279,10 +1274,7 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
}
}
}
// For debugging
// numCloses.incrementAndGet();
// openHandles.remove(this);
assert ObjectReleaseTracker.release(this);
}
/** Current core usage count. */

View File

@ -171,8 +171,11 @@ public class SolrDispatchFilter extends BaseSolrFilter {
@Override
public void destroy() {
if (cores != null) {
cores.shutdown();
cores = null;
try {
cores.shutdown();
} finally {
cores = null;
}
}
}

View File

@ -194,7 +194,7 @@ public class CdcrTransactionLog extends TransactionLog {
}
@Override
protected void close() {
public void close() {
try {
if (debug) {
log.debug("Closing tlog" + this);

View File

@ -17,6 +17,18 @@
package org.apache.solr.update;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
@ -42,7 +54,6 @@ import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.FunctionRangeQuery;
import org.apache.solr.search.QParser;
@ -51,21 +62,10 @@ import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.function.ValueSourceRangeFilter;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
/**
* <code>DirectUpdateHandler2</code> implements an UpdateHandler where documents are added
* directly to the main Lucene index as opposed to adding to a separate smaller index.
@ -740,11 +740,14 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
}
public static boolean commitOnClose = true; // TODO: make this a real config option?
public static boolean commitOnClose = true; // TODO: make this a real config option or move it to TestInjection
// IndexWriterCloser interface method - called from solrCoreState.decref(this)
@Override
public void closeWriter(IndexWriter writer) throws IOException {
assert TestInjection.injectNonGracefullClose(core.getCoreDescriptor().getCoreContainer());
boolean clearRequestInfo = false;
solrCoreState.getCommitLock().lock();
try {

View File

@ -304,7 +304,7 @@ public class HdfsTransactionLog extends TransactionLog {
}
@Override
protected void close() {
public void close() {
try {
if (debug) {
log.debug("Closing tlog" + this);

View File

@ -17,6 +17,7 @@
package org.apache.solr.update;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
@ -62,7 +63,7 @@ import org.slf4j.LoggerFactory;
* in them (since we know that if the request succeeds, all docs will be committed)
*
*/
public class TransactionLog {
public class TransactionLog implements Closeable {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static boolean debug = log.isDebugEnabled();
private static boolean trace = log.isTraceEnabled();
@ -537,7 +538,7 @@ public class TransactionLog {
}
}
protected void close() {
public void close() {
try {
if (debug) {
log.debug("Closing tlog" + this);

View File

@ -1,5 +1,7 @@
package org.apache.solr.update.processor;
import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -83,15 +85,15 @@ import org.apache.solr.update.UpdateHandler;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.update.VersionBucket;
import org.apache.solr.update.VersionInfo;
import org.apache.solr.util.TestInjection;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
// NOT mt-safe... create a new processor for each add thread
// TODO: we really should not wait for distrib after local? unless a certain replication factor is asked for
public class DistributedUpdateProcessor extends UpdateRequestProcessor {
public static final String DISTRIB_FROM_SHARD = "distrib.from.shard";
public static final String DISTRIB_FROM_COLLECTION = "distrib.from.collection";
public static final String DISTRIB_FROM_PARENT = "distrib.from.parent";
@ -346,6 +348,9 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
// locally we think we are leader but the request says it came FROMLEADER
// that could indicate a problem, let the full logic below figure it out
} else {
assert TestInjection.injectFailReplicaRequests();
isLeader = false; // we actually might be the leader, but we don't want leader-logic for these types of updates anyway.
forwardToLeader = false;
return nodes;

View File

@ -0,0 +1,142 @@
package org.apache.solr.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.Pair;
import org.apache.solr.core.CoreContainer;
/**
* Allows random faults to be injected in running code during test runs.
*/
public class TestInjection {
public static class TestShutdownFailError extends OutOfMemoryError {
public TestShutdownFailError(String msg) {
super(msg);
}
}
private static final Pattern ENABLED_PERCENT = Pattern.compile("(true|false)(?:\\:(\\d+))?$", Pattern.CASE_INSENSITIVE);
private static final Random RANDOM;
static {
// We try to make things reproducible in the context of our tests by initializing the random instance
// based on the current seed
String seed = System.getProperty("tests.seed");
if (seed == null) {
RANDOM = new Random();
} else {
RANDOM = new Random(seed.hashCode());
}
}
public static String nonGracefullClose = null;
public static String failReplicaRequests = null;
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
public static void reset() {
nonGracefullClose = null;
failReplicaRequests = null;
for (Timer timer : timers) {
timer.cancel();
}
}
public static boolean injectNonGracefullClose(CoreContainer cc) {
if (cc.isShutDown() && nonGracefullClose != null) {
Pair<Boolean,Integer> pair = parseValue(nonGracefullClose);
boolean enabled = pair.getKey();
int chanceIn100 = pair.getValue();
if (enabled && RANDOM.nextInt(100) >= (100 - chanceIn100)) {
if (RANDOM.nextBoolean()) {
throw new TestShutdownFailError("Test exception for non graceful close");
} else {
final Thread cthread = Thread.currentThread();
TimerTask task = new TimerTask() {
@Override
public void run() {
// as long as places that catch interruptedexception reset that
// interrupted status,
// we should only need to do it once
try {
Thread.sleep(RANDOM.nextInt(1000));
} catch (InterruptedException e) {
}
cthread.interrupt();
timers.remove(this);
cancel();
}
};
Timer timer = new Timer();
timers.add(timer);
timer.schedule(task, RANDOM.nextInt(500));
}
}
}
return true;
}
public static boolean injectFailReplicaRequests() {
if (failReplicaRequests != null) {
Pair<Boolean,Integer> pair = parseValue(failReplicaRequests);
boolean enabled = pair.getKey();
int chanceIn100 = pair.getValue();
if (enabled && RANDOM.nextInt(100) >= (100 - chanceIn100)) {
throw new SolrException(ErrorCode.SERVER_ERROR, "Random test update fail");
}
}
return true;
}
private static Pair<Boolean,Integer> parseValue(String raw) {
Matcher m = ENABLED_PERCENT.matcher(raw);
if (!m.matches()) throw new RuntimeException("No match, probably bad syntax: " + raw);
String val = m.group(1);
String percent = "100";
if (m.groupCount() == 2) {
percent = m.group(2);
}
return new Pair<>(Boolean.parseBoolean(val), Integer.parseInt(percent));
}
}

View File

@ -54,6 +54,16 @@
<str name="numRecordsToKeep">${solr.ulog.numRecordsToKeep:100}</str>
<int name="tlogDfsReplication">${solr.ulog.tlogDfsReplication:2}</int>
</updateLog>
<autoCommit>
<maxTime>${solr.autoCommit.maxTime:-1}</maxTime>
<openSearcher>false</openSearcher>
</autoCommit>
<autoSoftCommit>
<maxTime>${solr.autoSoftCommit.maxTime:-1}</maxTime>
</autoSoftCommit>
</updateHandler>
<updateRequestProcessorChain name="dedupe">

View File

@ -59,6 +59,7 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
@BeforeClass
public static void beforeSuperClass() {
schemaString = "schema15.xml"; // we need a string id
System.setProperty("solr.autoCommit.maxTime", "15000");
SolrCmdDistributor.testing_errorHook = new Diagnostics.Callable() {
@Override
public void call(Object... data) {
@ -73,6 +74,7 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
@AfterClass
public static void afterSuperClass() {
System.clearProperty("solr.autoCommit.maxTime");
SolrCmdDistributor.testing_errorHook = null;
}

View File

@ -40,6 +40,7 @@ public class ChaosMonkeySafeLeaderTest extends AbstractFullDistribZkTestBase {
@BeforeClass
public static void beforeSuperClass() {
schemaString = "schema15.xml"; // we need a string id
System.setProperty("solr.autoCommit.maxTime", "15000");
SolrCmdDistributor.testing_errorHook = new Diagnostics.Callable() {
@Override
public void call(Object... data) {
@ -54,6 +55,7 @@ public class ChaosMonkeySafeLeaderTest extends AbstractFullDistribZkTestBase {
@AfterClass
public static void afterSuperClass() {
System.clearProperty("solr.autoCommit.maxTime");
SolrCmdDistributor.testing_errorHook = null;
}

View File

@ -21,24 +21,32 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.Nightly;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.util.TestInjection;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
@Slow
@Nightly
@SuppressObjectReleaseTracker(bugUrl="this is a purposely leaky test")
public class RestartWhileUpdatingTest extends AbstractFullDistribZkTestBase {
//private static final String DISTRIB_UPDATE_CHAIN = "distrib-update-chain";
private List<StoppableIndexingThread> threads;
private volatile boolean stopExpire = false;
public RestartWhileUpdatingTest() {
public RestartWhileUpdatingTest() throws Exception {
super();
sliceCount = 1;
fixShardCount(3);
schemaString = "schema15.xml"; // we need a string id
useFactory("solr.StandardDirectoryFactory");
}
public static String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"};
@ -51,6 +59,23 @@ public class RestartWhileUpdatingTest extends AbstractFullDistribZkTestBase {
protected RandVal[] getRandValues() {
return randVals;
}
@BeforeClass
public static void beforeRestartWhileUpdatingTest() {
System.setProperty("leaderVoteWait", "300000");
System.setProperty("solr.autoCommit.maxTime", "30000");
System.setProperty("solr.autoSoftCommit.maxTime", "3000");
TestInjection.nonGracefullClose = "true:60";
TestInjection.failReplicaRequests = "true:03";
}
@AfterClass
public static void afterRestartWhileUpdatingTest() {
System.clearProperty("leaderVoteWait");
System.clearProperty("solr.autoCommit.maxTime");
System.clearProperty("solr.autoSoftCommit.maxTime");
TestInjection.reset();
}
@Test
public void test() throws Exception {
@ -66,7 +91,30 @@ public class RestartWhileUpdatingTest extends AbstractFullDistribZkTestBase {
int numThreads = random().nextInt(4) + 1;
threads = new ArrayList<>(2);
threads = new ArrayList<>(numThreads);
Thread expireThread = new Thread() {
public void run() {
while (!stopExpire) {
try {
Thread.sleep(random().nextInt(15000));
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
// try {
// chaosMonkey.expireRandomSession();
// } catch (KeeperException e) {
// throw new RuntimeException(e);
// } catch (InterruptedException e) {
// throw new RuntimeException(e);
// }
}
}
};
// Currently unused
// expireThread.start();
StoppableIndexingThread indexThread;
for (int i = 0; i < numThreads; i++) {
@ -77,9 +125,11 @@ public class RestartWhileUpdatingTest extends AbstractFullDistribZkTestBase {
Thread.sleep(2000);
int restartTimes = random().nextInt(4) + 1;;
int restartTimes = 1;//random().nextInt(4) + 1;;
for (int i = 0; i < restartTimes; i++) {
Thread.sleep(random().nextInt(300000));
stopAndStartAllReplicas();
Thread.sleep(random().nextInt(30000));
}
Thread.sleep(2000);
@ -87,12 +137,13 @@ public class RestartWhileUpdatingTest extends AbstractFullDistribZkTestBase {
// stop indexing threads
for (StoppableIndexingThread thread : threads) {
thread.safeStop();
thread.safeStop();
}
stopExpire = true;
expireThread.join();
Thread.sleep(1000);
waitForThingsToLevelOut(120);
waitForThingsToLevelOut(320);
Thread.sleep(2000);
@ -107,8 +158,13 @@ public class RestartWhileUpdatingTest extends AbstractFullDistribZkTestBase {
}
public void stopAndStartAllReplicas() throws Exception, InterruptedException {
chaosMonkey.stopAll(random().nextInt(2000));
chaosMonkey.stopAll(random().nextInt(1));
if (random().nextBoolean()) {
for (StoppableIndexingThread thread : threads) {
thread.safeStop();
}
}
Thread.sleep(1000);
chaosMonkey.startAll();

View File

@ -0,0 +1,63 @@
package org.apache.solr.cloud.hdfs;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.cloud.RestartWhileUpdatingTest;
import org.apache.solr.util.BadHdfsThreadsFilter;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import com.carrotsearch.randomizedtesting.annotations.Nightly;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
@Slow
@Nightly
@ThreadLeakFilters(defaultFilters = true, filters = {
BadHdfsThreadsFilter.class // hdfs currently leaks thread(s)
})
public class HdfsRestartWhileUpdatingTest extends RestartWhileUpdatingTest {
public HdfsRestartWhileUpdatingTest() throws Exception {
super();
}
private static MiniDFSCluster dfsCluster;
@BeforeClass
public static void setupClass() throws Exception {
dfsCluster = HdfsTestUtil.setupClass(createTempDir().toFile().getAbsolutePath());
System.setProperty("solr.hdfs.blockcache.blocksperbank", "2048");
}
@AfterClass
public static void teardownClass() throws Exception {
HdfsTestUtil.teardownClass(dfsCluster);
dfsCluster = null;
}
@Override
protected String getDataDir(String dataDir) throws IOException {
return HdfsTestUtil.getDataDir(dfsCluster, dataDir);
}
}

View File

@ -0,0 +1,102 @@
package org.apache.solr.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class TestTestInjection extends LuceneTestCase {
@BeforeClass
public static void beforeClass() {
}
@AfterClass
public static void cleanup() {
}
public void testBasics() {
TestInjection.failReplicaRequests = "true:100";
try {
TestInjection.injectFailReplicaRequests();
fail("should fail 100%");
} catch (Throwable e) {
assertFalse("Should not fail based on bad syntax",
e.getMessage().toLowerCase(Locale.ENGLISH).contains("bad syntax"));
// good
// assertTrue("Should fail with * based error: " + e.getClass().getName(), (e instanceof *));
}
TestInjection.failReplicaRequests = "true:00";
for (int i = 0; i < 100; i++) {
// should never fail
TestInjection.injectFailReplicaRequests();
}
}
public void testBadSyntax() {
testBadSyntax("true/10");
testBadSyntax("boo:100");
testBadSyntax("false:100f");
testBadSyntax("TRUE:0:");
}
public void testGoodSyntax() {
testGoodSyntax("true:10");
testGoodSyntax("true:100");
testGoodSyntax("false:100");
testGoodSyntax("TRUE:0");
testGoodSyntax("TRUE:00");
testGoodSyntax("TRUE:000");
testGoodSyntax("FALSE:50");
testGoodSyntax("FAlsE:99");
}
public void testBadSyntax(String syntax) {
TestInjection.failReplicaRequests = syntax;
try {
TestInjection.injectFailReplicaRequests();
fail("should fail 100%");
} catch (Exception e) {
assertTrue(e.getMessage().toLowerCase(Locale.ENGLISH).contains("bad syntax"));
// good
}
}
public void testGoodSyntax(String syntax) {
TestInjection.failReplicaRequests = syntax;
try {
TestInjection.injectFailReplicaRequests();
} catch (Exception e) {
// we can fail, but should not be for bad syntax
assertFalse(e.getMessage().toLowerCase(Locale.ENGLISH).contains("bad syntax"));
}
}
}

View File

@ -17,15 +17,24 @@ package org.apache.solr.common.util;
* limitations under the License.
*/
import java.io.Closeable;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.HashSet;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ObjectReleaseTracker {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public static Map<Object,String> OBJECTS = new ConcurrentHashMap<>();
public static boolean track(Object object) {
@ -41,15 +50,30 @@ public class ObjectReleaseTracker {
return true;
}
public static void clear() {
OBJECTS.clear();
}
/**
* @return null if ok else error message
*/
public static String clearObjectTrackerAndCheckEmpty() {
String result = checkEmpty();
OBJECTS.clear();
return result;
}
/**
* @return null if ok else error message
*/
public static String checkEmpty() {
String error = null;
Set<Entry<Object,String>> entries = OBJECTS.entrySet();
boolean empty = entries.isEmpty();
if (entries.size() > 0) {
Set<String> objects = new HashSet<>();
List<String> objects = new ArrayList<>();
for (Entry<Object,String> entry : entries) {
objects.add(entry.getKey().getClass().getSimpleName());
}
@ -62,12 +86,33 @@ public class ObjectReleaseTracker {
}
}
OBJECTS.clear();
return error;
}
public static void tryClose() {
Set<Entry<Object,String>> entries = OBJECTS.entrySet();
if (entries.size() > 0) {
for (Entry<Object,String> entry : entries) {
if (entry.getKey() instanceof Closeable) {
try {
((Closeable)entry.getKey()).close();
} catch (Throwable t) {
log.error("", t);
}
} else if (entry.getKey() instanceof ExecutorService) {
try {
ExecutorUtil.shutdownAndAwaitTermination((ExecutorService)entry.getKey());
} catch (Throwable t) {
log.error("", t);
}
}
}
}
}
private static class ObjectTrackerException extends RuntimeException {
}
}

View File

@ -963,7 +963,11 @@ public abstract class BaseDistributedSearchTestCase extends SolrTestCaseJ4 {
createServers(numShards);
RandVal.uniqueValues = new HashSet(); //reset random values
statement.evaluate();
destroyServers();
try {
destroyServers();
} catch (Throwable t) {
log.error("Error while shutting down servers", t);
}
}
}

View File

@ -222,20 +222,24 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase {
if (suiteFailureMarker.wasSuccessful()) {
// if the tests passed, make sure everything was closed / released
endTrackingSearchers();
String orr = ObjectReleaseTracker.clearObjectTrackerAndCheckEmpty();
if (!RandomizedContext.current().getTargetClass().isAnnotationPresent(SuppressObjectReleaseTracker.class)) {
String orr = ObjectReleaseTracker.clearObjectTrackerAndCheckEmpty();
endTrackingSearchers(120, true);
assertNull(orr, orr);
} else {
endTrackingSearchers(15, false);
String orr = ObjectReleaseTracker.checkEmpty();
if (orr != null) {
log.warn(
"Some resources were not closed, shutdown, or released. This has been ignored due to the SuppressObjectReleaseTracker annotation.");
"Some resources were not closed, shutdown, or released. This has been ignored due to the SuppressObjectReleaseTracker annotation, trying to close them now.");
ObjectReleaseTracker.tryClose();
}
}
}
resetFactory();
coreName = DEFAULT_TEST_CORENAME;
} finally {
ObjectReleaseTracker.clear();
initCoreDataDir = null;
System.clearProperty("zookeeper.forceSync");
System.clearProperty("jetty.testMode");
@ -423,14 +427,14 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase {
}
}
public static void endTrackingSearchers() {
public static void endTrackingSearchers(int waitSeconds, boolean failTest) {
long endNumOpens = SolrIndexSearcher.numOpens.get();
long endNumCloses = SolrIndexSearcher.numCloses.get();
// wait a bit in case any ending threads have anything to release
int retries = 0;
while (endNumOpens - numOpens != endNumCloses - numCloses) {
if (retries++ > 120) {
if (retries++ > waitSeconds) {
break;
}
try {
@ -450,7 +454,7 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase {
if ("TestReplicationHandler".equals(RandomizedContext.current().getTargetClass().getSimpleName())) {
log.warn("TestReplicationHandler wants to fail!: " + msg);
} else {
fail(msg);
if (failTest) fail(msg);
}
}
}

View File

@ -158,7 +158,7 @@ public abstract class AbstractDistribZkTestBase extends BaseDistributedSearchTes
+ " live:"
+ clusterState.liveNodesContain(shard.getValue().getNodeName()));
final Replica.State state = shard.getValue().getState();
if ((state == Replica.State.RECOVERING || state == Replica.State.DOWN)
if ((state == Replica.State.RECOVERING || state == Replica.State.DOWN || state == Replica.State.RECOVERY_FAILED)
&& clusterState.liveNodesContain(shard.getValue().getStr(ZkStateReader.NODE_NAME_PROP))) {
sawLiveRecovering = true;
}

View File

@ -21,6 +21,7 @@ import java.lang.invoke.MethodHandles;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
@ -185,9 +186,22 @@ public class ChaosMonkey {
assert(jetty != null);
monkeyLog("stop shard! " + jetty.getLocalPort());
SolrDispatchFilter sdf = jetty.getSolrDispatchFilter();
if (sdf != null)
sdf.destroy();
jetty.stop();
if (sdf != null) {
try {
sdf.destroy();
} catch (Throwable t) {
log.error("", t);
}
}
try {
jetty.stop();
} catch (InterruptedException e) {
log.info("Jetty stop interrupted - should be a test caused interruption, we will try again to be sure we shutdown");
}
if (!jetty.isStopped()) {
jetty.stop();
}
if (!jetty.isStopped()) {
throw new RuntimeException("could not stop jetty");
@ -231,13 +245,28 @@ public class ChaosMonkey {
public void stopAll(int pauseBetweenMs) throws Exception {
Set<String> keys = shardToJetty.keySet();
List<Thread> jettyThreads = new ArrayList<>(keys.size());
for (String key : keys) {
List<CloudJettyRunner> jetties = shardToJetty.get(key);
for (CloudJettyRunner jetty : jetties) {
Thread.sleep(pauseBetweenMs);
stopJetty(jetty);
Thread thread = new Thread() {
public void run() {
try {
stopJetty(jetty);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
};
jettyThreads.add(thread);
thread.start();
}
}
for (Thread thread : jettyThreads) {
thread.join();
}
}
public void startAll() throws Exception {
@ -359,12 +388,17 @@ public class ChaosMonkey {
// cluster state can be stale - also go by our 'near real-time' is leader prop
boolean rtIsLeader;
try (SolrCore core = cjetty.jetty.getCoreContainer().getCore(leader.getStr(ZkStateReader.CORE_NAME_PROP))) {
if (core == null) {
monkeyLog("selected jetty not running correctly - skip");
return null;
CoreContainer cc = cjetty.jetty.getCoreContainer();
if (cc != null) {
try (SolrCore core = cc.getCore(leader.getStr(ZkStateReader.CORE_NAME_PROP))) {
if (core == null) {
monkeyLog("selected jetty not running correctly - skip");
return null;
}
rtIsLeader = core.getCoreDescriptor().getCloudDescriptor().isLeader();
}
rtIsLeader = core.getCoreDescriptor().getCloudDescriptor().isLeader();
} else {
return null;
}
boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(jetties.get(index).nodeName)