Detect nodes being blocked by GC-disrupted node (#21797)
The disruption type LongGCDisruption simulates GCs on a node by suspending all the threads of that node. If the suspended threads are in a code section with shared JVM locks, however, it can prevent the other nodes from doing their thing. The class LongGCDisruption has a list of class names for which we know that this can occur. Whenever a test using the GC disruption type fails in mysterious ways, it becomes a long guessing game to find the offending class. This commit adds code to LongGCDisruption to automatically detect these situations, fail the test early and report the offending class and all relevant context.
This commit is contained in:
parent
db13b4c2bc
commit
7e198f0e41
|
@ -628,7 +628,7 @@ public class DiscoveryWithServiceDisruptionsIT extends ESIntegTestCase {
|
|||
|
||||
String oldMasterNode = internalCluster().getMasterName();
|
||||
// a very long GC, but it's OK as we remove the disruption when it has had an effect
|
||||
SingleNodeDisruption masterNodeDisruption = new IntermittentLongGCDisruption(oldMasterNode, random(), 100, 200, 30000, 60000);
|
||||
SingleNodeDisruption masterNodeDisruption = new IntermittentLongGCDisruption(random(), oldMasterNode, 100, 200, 30000, 60000);
|
||||
internalCluster().setDisruptionScheme(masterNodeDisruption);
|
||||
masterNodeDisruption.startDisrupting();
|
||||
|
||||
|
|
|
@ -39,19 +39,6 @@ public class IntermittentLongGCDisruption extends LongGCDisruption {
|
|||
final long delayDurationMax;
|
||||
|
||||
|
||||
public IntermittentLongGCDisruption(Random random) {
|
||||
this(null, random);
|
||||
}
|
||||
|
||||
public IntermittentLongGCDisruption(String disruptedNode, Random random) {
|
||||
this(disruptedNode, random, 100, 200, 300, 20000);
|
||||
}
|
||||
|
||||
public IntermittentLongGCDisruption(String disruptedNode, Random random, long intervalBetweenDelaysMin,
|
||||
long intervalBetweenDelaysMax, long delayDurationMin, long delayDurationMax) {
|
||||
this(random, disruptedNode, intervalBetweenDelaysMin, intervalBetweenDelaysMax, delayDurationMin, delayDurationMax);
|
||||
}
|
||||
|
||||
public IntermittentLongGCDisruption(Random random, String disruptedNode, long intervalBetweenDelaysMin, long intervalBetweenDelaysMax,
|
||||
long delayDurationMin, long delayDurationMax) {
|
||||
super(random, disruptedNode);
|
||||
|
@ -88,19 +75,15 @@ public class IntermittentLongGCDisruption extends LongGCDisruption {
|
|||
}
|
||||
|
||||
private void simulateLongGC(final TimeValue duration) throws InterruptedException {
|
||||
final String disruptionNodeCopy = disruptedNode;
|
||||
if (disruptionNodeCopy == null) {
|
||||
return;
|
||||
}
|
||||
logger.info("node [{}] goes into GC for for [{}]", disruptionNodeCopy, duration);
|
||||
logger.info("node [{}] goes into GC for for [{}]", disruptedNode, duration);
|
||||
final Set<Thread> nodeThreads = new HashSet<>();
|
||||
try {
|
||||
while (stopNodeThreads(disruptionNodeCopy, nodeThreads)) ;
|
||||
while (stopNodeThreads(nodeThreads)) ;
|
||||
if (!nodeThreads.isEmpty()) {
|
||||
Thread.sleep(duration.millis());
|
||||
}
|
||||
} finally {
|
||||
logger.info("node [{}] resumes from GC", disruptionNodeCopy);
|
||||
logger.info("node [{}] resumes from GC", disruptedNode);
|
||||
resumeThreads(nodeThreads);
|
||||
}
|
||||
}
|
||||
|
@ -109,13 +92,13 @@ public class IntermittentLongGCDisruption extends LongGCDisruption {
|
|||
|
||||
@Override
|
||||
public void run() {
|
||||
while (disrupting && disruptedNode != null) {
|
||||
while (disrupting) {
|
||||
try {
|
||||
TimeValue duration = new TimeValue(delayDurationMin + random.nextInt((int) (delayDurationMax - delayDurationMin)));
|
||||
simulateLongGC(duration);
|
||||
|
||||
duration = new TimeValue(intervalBetweenDelaysMin + random.nextInt((int) (intervalBetweenDelaysMax - intervalBetweenDelaysMin)));
|
||||
if (disrupting && disruptedNode != null) {
|
||||
if (disrupting) {
|
||||
Thread.sleep(duration.millis());
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
|
|
|
@ -19,11 +19,15 @@
|
|||
|
||||
package org.elasticsearch.test.disruption;
|
||||
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.common.SuppressForbidden;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
||||
import org.elasticsearch.test.InternalTestCluster;
|
||||
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.lang.management.ThreadInfo;
|
||||
import java.lang.management.ThreadMXBean;
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
|
@ -41,11 +45,16 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
// logging has shared JVM locks - we may suspend a thread and block other nodes from doing their thing
|
||||
Pattern.compile("logging\\.log4j"),
|
||||
// security manager is shared across all nodes AND it uses synced hashmaps interanlly
|
||||
Pattern.compile("java\\.lang\\.SecurityManager")
|
||||
Pattern.compile("java\\.lang\\.SecurityManager"),
|
||||
// SecureRandom instance from SecureRandomHolder class is shared by all nodes
|
||||
Pattern.compile("java\\.security\\.SecureRandom")
|
||||
};
|
||||
|
||||
private static final ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
|
||||
|
||||
protected final String disruptedNode;
|
||||
private Set<Thread> suspendedThreads;
|
||||
private Thread blockDetectionThread;
|
||||
|
||||
public LongGCDisruption(Random random, String disruptedNode) {
|
||||
super(random);
|
||||
|
@ -60,7 +69,7 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
suspendedThreads = ConcurrentHashMap.newKeySet();
|
||||
|
||||
final String currentThreadName = Thread.currentThread().getName();
|
||||
assert currentThreadName.contains("[" + disruptedNode + "]") == false :
|
||||
assert isDisruptedNodeThread(currentThreadName) == false :
|
||||
"current thread match pattern. thread name: " + currentThreadName + ", node: " + disruptedNode;
|
||||
// we spawn a background thread to protect against deadlock which can happen
|
||||
// if there are shared resources between caller thread and and suspended threads
|
||||
|
@ -75,7 +84,7 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
@Override
|
||||
protected void doRun() throws Exception {
|
||||
// keep trying to stop threads, until no new threads are discovered.
|
||||
while (stopNodeThreads(disruptedNode, suspendedThreads)) {
|
||||
while (stopNodeThreads(suspendedThreads)) {
|
||||
if (Thread.interrupted()) {
|
||||
return;
|
||||
}
|
||||
|
@ -95,13 +104,52 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
}
|
||||
if (stoppingThread.isAlive()) {
|
||||
logger.warn("failed to stop node [{}]'s threads within [{}] millis. Stopping thread stack trace:\n {}"
|
||||
, disruptedNode, getStoppingTimeoutInMillis(), stackTrace(stoppingThread));
|
||||
, disruptedNode, getStoppingTimeoutInMillis(), stackTrace(stoppingThread.getStackTrace()));
|
||||
stoppingThread.interrupt(); // best effort;
|
||||
throw new RuntimeException("stopping node threads took too long");
|
||||
}
|
||||
// block detection checks if other threads are blocked waiting on an object that is held by one
|
||||
// of the threads that was suspended
|
||||
if (isBlockDetectionSupported()) {
|
||||
blockDetectionThread = new Thread(new AbstractRunnable() {
|
||||
@Override
|
||||
public void onFailure(Exception e) {
|
||||
if (e instanceof InterruptedException == false) {
|
||||
throw new AssertionError("unexpected exception in blockDetectionThread", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doRun() throws Exception {
|
||||
while (Thread.currentThread().isInterrupted() == false) {
|
||||
ThreadInfo[] threadInfos = threadBean.dumpAllThreads(true, true);
|
||||
for (ThreadInfo threadInfo : threadInfos) {
|
||||
if (isDisruptedNodeThread(threadInfo.getThreadName()) == false &&
|
||||
threadInfo.getLockOwnerName() != null &&
|
||||
isDisruptedNodeThread(threadInfo.getLockOwnerName())) {
|
||||
|
||||
// find ThreadInfo object of the blocking thread (if available)
|
||||
ThreadInfo blockingThreadInfo = null;
|
||||
for (ThreadInfo otherThreadInfo : threadInfos) {
|
||||
if (otherThreadInfo.getThreadId() == threadInfo.getLockOwnerId()) {
|
||||
blockingThreadInfo = otherThreadInfo;
|
||||
break;
|
||||
}
|
||||
}
|
||||
onBlockDetected(threadInfo, blockingThreadInfo);
|
||||
}
|
||||
}
|
||||
Thread.sleep(getBlockDetectionIntervalInMillis());
|
||||
}
|
||||
}
|
||||
});
|
||||
blockDetectionThread.setName(currentThreadName + "[LongGCDisruption][blockDetection]");
|
||||
blockDetectionThread.start();
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success == false) {
|
||||
stopBlockDetection();
|
||||
// resume threads if failed
|
||||
resumeThreads(suspendedThreads);
|
||||
suspendedThreads = null;
|
||||
|
@ -112,18 +160,35 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
}
|
||||
}
|
||||
|
||||
private String stackTrace(Thread thread) {
|
||||
return Arrays.stream(thread.getStackTrace()).map(Object::toString).collect(Collectors.joining("\n"));
|
||||
public boolean isDisruptedNodeThread(String threadName) {
|
||||
return threadName.contains("[" + disruptedNode + "]");
|
||||
}
|
||||
|
||||
private String stackTrace(StackTraceElement[] stackTraceElements) {
|
||||
return Arrays.stream(stackTraceElements).map(Object::toString).collect(Collectors.joining("\n"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void stopDisrupting() {
|
||||
stopBlockDetection();
|
||||
if (suspendedThreads != null) {
|
||||
resumeThreads(suspendedThreads);
|
||||
suspendedThreads = null;
|
||||
}
|
||||
}
|
||||
|
||||
private void stopBlockDetection() {
|
||||
if (blockDetectionThread != null) {
|
||||
try {
|
||||
blockDetectionThread.interrupt(); // best effort
|
||||
blockDetectionThread.join(getStoppingTimeoutInMillis());
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
blockDetectionThread = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void removeAndEnsureHealthy(InternalTestCluster cluster) {
|
||||
removeFromCluster(cluster);
|
||||
|
@ -144,7 +209,7 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
*/
|
||||
@SuppressWarnings("deprecation") // stops/resumes threads intentionally
|
||||
@SuppressForbidden(reason = "stops/resumes threads intentionally")
|
||||
protected boolean stopNodeThreads(String node, Set<Thread> nodeThreads) {
|
||||
protected boolean stopNodeThreads(Set<Thread> nodeThreads) {
|
||||
Thread[] allThreads = null;
|
||||
while (allThreads == null) {
|
||||
allThreads = new Thread[Thread.activeCount()];
|
||||
|
@ -154,16 +219,15 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
}
|
||||
}
|
||||
boolean liveThreadsFound = false;
|
||||
final String nodeThreadNamePart = "[" + node + "]";
|
||||
for (Thread thread : allThreads) {
|
||||
if (thread == null) {
|
||||
continue;
|
||||
}
|
||||
String name = thread.getName();
|
||||
if (name.contains(nodeThreadNamePart)) {
|
||||
String threadName = thread.getName();
|
||||
if (isDisruptedNodeThread(threadName)) {
|
||||
if (thread.isAlive() && nodeThreads.add(thread)) {
|
||||
liveThreadsFound = true;
|
||||
logger.trace("stopping thread [{}]", name);
|
||||
logger.trace("stopping thread [{}]", threadName);
|
||||
thread.suspend();
|
||||
// double check the thread is not in a shared resource like logging. If so, let it go and come back..
|
||||
boolean safe = true;
|
||||
|
@ -178,7 +242,7 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
}
|
||||
}
|
||||
if (!safe) {
|
||||
logger.trace("resuming thread [{}] as it is in a critical section", name);
|
||||
logger.trace("resuming thread [{}] as it is in a critical section", threadName);
|
||||
thread.resume();
|
||||
nodeThreads.remove(thread);
|
||||
}
|
||||
|
@ -198,6 +262,28 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
return TimeValue.timeValueSeconds(30).getMillis();
|
||||
}
|
||||
|
||||
public boolean isBlockDetectionSupported() {
|
||||
return threadBean.isObjectMonitorUsageSupported() && threadBean.isSynchronizerUsageSupported();
|
||||
}
|
||||
|
||||
// for testing
|
||||
protected long getBlockDetectionIntervalInMillis() {
|
||||
return 3000L;
|
||||
}
|
||||
|
||||
// for testing
|
||||
protected void onBlockDetected(ThreadInfo blockedThread, @Nullable ThreadInfo blockingThread) {
|
||||
String blockedThreadStackTrace = stackTrace(blockedThread.getStackTrace());
|
||||
String blockingThreadStackTrace = blockingThread != null ?
|
||||
stackTrace(blockingThread.getStackTrace()) : "not available";
|
||||
throw new AssertionError("Thread [" + blockedThread.getThreadName() + "] is blocked waiting on the resource [" +
|
||||
blockedThread.getLockInfo() + "] held by the suspended thread [" + blockedThread.getLockOwnerName() +
|
||||
"] of the disrupted node [" + disruptedNode + "].\n" +
|
||||
"Please add this occurrence to the unsafeClasses list in [" + LongGCDisruption.class.getName() + "].\n" +
|
||||
"Stack trace of blocked thread: " + blockedThreadStackTrace + "\n" +
|
||||
"Stack trace of blocking thread: " + blockingThreadStackTrace);
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation") // stops/resumes threads intentionally
|
||||
@SuppressForbidden(reason = "stops/resumes threads intentionally")
|
||||
protected void resumeThreads(Set<Thread> threads) {
|
||||
|
|
|
@ -18,11 +18,15 @@
|
|||
*/
|
||||
package org.elasticsearch.test.disruption;
|
||||
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.lang.management.ThreadInfo;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.concurrent.locks.ReentrantLock;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -148,4 +152,94 @@ public class LongGCDisruptionTest extends ESTestCase {
|
|||
stop.set(true);
|
||||
}
|
||||
}
|
||||
|
||||
public void testBlockDetection() throws Exception {
|
||||
final String disruptedNodeName = "disrupted_node";
|
||||
final String blockedNodeName = "blocked_node";
|
||||
CountDownLatch waitForBlockDetectionResult = new CountDownLatch(1);
|
||||
AtomicReference<ThreadInfo> blockDetectionResult = new AtomicReference<>();
|
||||
LongGCDisruption disruption = new LongGCDisruption(random(), disruptedNodeName) {
|
||||
@Override
|
||||
protected Pattern[] getUnsafeClasses() {
|
||||
return new Pattern[0];
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void onBlockDetected(ThreadInfo blockedThread, @Nullable ThreadInfo blockingThread) {
|
||||
blockDetectionResult.set(blockedThread);
|
||||
waitForBlockDetectionResult.countDown();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected long getBlockDetectionIntervalInMillis() {
|
||||
return 10L;
|
||||
}
|
||||
};
|
||||
if (disruption.isBlockDetectionSupported() == false) {
|
||||
return;
|
||||
}
|
||||
final AtomicBoolean stop = new AtomicBoolean();
|
||||
final CountDownLatch underLock = new CountDownLatch(1);
|
||||
final CountDownLatch pauseUnderLock = new CountDownLatch(1);
|
||||
final LockedExecutor lockedExecutor = new LockedExecutor();
|
||||
final AtomicLong ops = new AtomicLong();
|
||||
try {
|
||||
for (int i = 0; i < 5; i++) {
|
||||
// at least one locked and one none lock thread
|
||||
final boolean lockedExec = (i < 4 && randomBoolean()) || i == 0;
|
||||
Thread thread = new Thread(() -> {
|
||||
while (stop.get() == false) {
|
||||
if (lockedExec) {
|
||||
lockedExecutor.executeLocked(() -> {
|
||||
try {
|
||||
underLock.countDown();
|
||||
ops.incrementAndGet();
|
||||
pauseUnderLock.await();
|
||||
} catch (InterruptedException e) {
|
||||
|
||||
}
|
||||
});
|
||||
} else {
|
||||
ops.incrementAndGet();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
thread.setName("[" + disruptedNodeName + "][" + i + "]");
|
||||
thread.start();
|
||||
}
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
// at least one locked and one none lock thread
|
||||
final boolean lockedExec = (i < 4 && randomBoolean()) || i == 0;
|
||||
Thread thread = new Thread(() -> {
|
||||
while (stop.get() == false) {
|
||||
if (lockedExec) {
|
||||
lockedExecutor.executeLocked(() -> {
|
||||
ops.incrementAndGet();
|
||||
});
|
||||
} else {
|
||||
ops.incrementAndGet();
|
||||
}
|
||||
}
|
||||
});
|
||||
thread.setName("[" + blockedNodeName + "][" + i + "]");
|
||||
thread.start();
|
||||
}
|
||||
// make sure some threads of test_node are under lock
|
||||
underLock.await();
|
||||
disruption.startDisrupting();
|
||||
waitForBlockDetectionResult.await(30, TimeUnit.SECONDS);
|
||||
disruption.stopDisrupting();
|
||||
|
||||
ThreadInfo threadInfo = blockDetectionResult.get();
|
||||
assertNotNull(threadInfo);
|
||||
assertThat(threadInfo.getThreadName(), containsString("[" + blockedNodeName + "]"));
|
||||
assertThat(threadInfo.getLockOwnerName(), containsString("[" + disruptedNodeName + "]"));
|
||||
assertThat(threadInfo.getLockInfo().getClassName(), containsString(ReentrantLock.class.getName()));
|
||||
} finally {
|
||||
stop.set(true);
|
||||
pauseUnderLock.countDown();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue