Work around JVM Bug in LongGCDisruptionTests (#50731) (#50974)

There is a JVM bug causing `Thread#suspend` calls to randomly take
multiple seconds breaking these tests that call the method numerous times
in a loop. Increasing the timeout would will not work since we may call
`suspend` tens if not hundreds of times and even a small number of them
experiencing the blocking will lead to multiple minutes of waiting.

This PR detects the specific issue by timing the `Thread#suspend` calls and
skips the remainder of the test if it timed out because of the JVM bug.

Closes #50047
This commit is contained in:
Armin Braun 2020-01-14 17:13:21 +01:00 committed by GitHub
parent d8510be3d9
commit 6e8ea7aaa2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 4 deletions

View File

@ -32,6 +32,8 @@ import java.util.Arrays;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
private Set<Thread> suspendedThreads;
private Thread blockDetectionThread;
private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);
public LongGCDisruption(Random random, String disruptedNode) {
super(random);
this.disruptedNode = disruptedNode;
}
/**
* Checks if during disruption we ran into a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
* was observed.
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
*/
public boolean sawSlowSuspendBug() {
return sawSlowSuspendBug.get();
}
@Override
public synchronized void startDisrupting() {
if (suspendedThreads == null) {
@ -251,7 +265,11 @@ public class LongGCDisruption extends SingleNodeDisruption {
* assuming that it is safe.
*/
boolean definitelySafe = true;
final long startTime = System.nanoTime();
thread.suspend();
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
sawSlowSuspendBug.set(true);
}
// double check the thread is not in a shared resource like logging; if so, let it go and come back
safe:
for (StackTraceElement stackElement : thread.getStackTrace()) {

View File

@ -18,7 +18,6 @@
*/
package org.elasticsearch.test.disruption;
import org.elasticsearch.bootstrap.JavaVersion;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.test.ESTestCase;
@ -115,8 +114,6 @@ public class LongGCDisruptionTests extends ESTestCase {
* but does keep retrying until all threads can be safely paused
*/
public void testNotBlockingUnsafeStackTraces() throws Exception {
assumeFalse("https://github.com/elastic/elasticsearch/issues/50047",
JavaVersion.current().equals(JavaVersion.parse("11")) || JavaVersion.current().equals(JavaVersion.parse("12")));
final String nodeName = "test_node";
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
@Override
@ -149,7 +146,14 @@ public class LongGCDisruptionTests extends ESTestCase {
threads[i].start();
}
// make sure some threads are under lock
disruption.startDisrupting();
try {
disruption.startDisrupting();
} catch (RuntimeException e) {
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
return;
}
throw new AssertionError(e);
}
long first = ops.get();
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
Thread.sleep(100);
@ -157,6 +161,7 @@ public class LongGCDisruptionTests extends ESTestCase {
disruption.stopDisrupting();
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
} finally {
disruption.stopDisrupting();
stop.set(true);
for (final Thread thread : threads) {
thread.join();