There is a JVM bug causing `Thread#suspend` calls to randomly take multiple seconds, breaking these tests, which call the method numerous times in a loop. Increasing the timeout will not work, since we may call `suspend` tens if not hundreds of times, and even a small number of those calls blocking will lead to multiple minutes of waiting. This PR detects the specific issue by timing the `Thread#suspend` calls and skips the remainder of the test if it timed out because of the JVM bug. Closes #50047
This commit is contained in:
parent
d8510be3d9
commit
6e8ea7aaa2
|
@ -32,6 +32,8 @@ import java.util.Arrays;
|
|||
import java.util.Random;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
private Set<Thread> suspendedThreads;
|
||||
private Thread blockDetectionThread;
|
||||
|
||||
private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);
|
||||
|
||||
public LongGCDisruption(Random random, String disruptedNode) {
|
||||
super(random);
|
||||
this.disruptedNode = disruptedNode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if during disruption a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
|
||||
* was observed.
|
||||
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
|
||||
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
|
||||
*/
|
||||
public boolean sawSlowSuspendBug() {
|
||||
return sawSlowSuspendBug.get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void startDisrupting() {
|
||||
if (suspendedThreads == null) {
|
||||
|
@ -251,7 +265,11 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
|||
* assuming that it is safe.
|
||||
*/
|
||||
boolean definitelySafe = true;
|
||||
final long startTime = System.nanoTime();
|
||||
thread.suspend();
|
||||
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
|
||||
sawSlowSuspendBug.set(true);
|
||||
}
|
||||
// double check the thread is not in a shared resource like logging; if so, let it go and come back
|
||||
safe:
|
||||
for (StackTraceElement stackElement : thread.getStackTrace()) {
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
*/
|
||||
package org.elasticsearch.test.disruption;
|
||||
|
||||
import org.elasticsearch.bootstrap.JavaVersion;
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
|
@ -115,8 +114,6 @@ public class LongGCDisruptionTests extends ESTestCase {
|
|||
* but does keep retrying until all threads can be safely paused
|
||||
*/
|
||||
public void testNotBlockingUnsafeStackTraces() throws Exception {
|
||||
assumeFalse("https://github.com/elastic/elasticsearch/issues/50047",
|
||||
JavaVersion.current().equals(JavaVersion.parse("11")) || JavaVersion.current().equals(JavaVersion.parse("12")));
|
||||
final String nodeName = "test_node";
|
||||
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
|
||||
@Override
|
||||
|
@ -149,7 +146,14 @@ public class LongGCDisruptionTests extends ESTestCase {
|
|||
threads[i].start();
|
||||
}
|
||||
// make sure some threads are under lock
|
||||
disruption.startDisrupting();
|
||||
try {
|
||||
disruption.startDisrupting();
|
||||
} catch (RuntimeException e) {
|
||||
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
|
||||
return;
|
||||
}
|
||||
throw new AssertionError(e);
|
||||
}
|
||||
long first = ops.get();
|
||||
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
|
||||
Thread.sleep(100);
|
||||
|
@ -157,6 +161,7 @@ public class LongGCDisruptionTests extends ESTestCase {
|
|||
disruption.stopDisrupting();
|
||||
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
|
||||
} finally {
|
||||
disruption.stopDisrupting();
|
||||
stop.set(true);
|
||||
for (final Thread thread : threads) {
|
||||
thread.join();
|
||||
|
|
Loading…
Reference in New Issue