There is a JVM bug causing `Thread#suspend` calls to randomly take multiple seconds breaking these tests that call the method numerous times in a loop. Increasing the timeout would will not work since we may call `suspend` tens if not hundreds of times and even a small number of them experiencing the blocking will lead to multiple minutes of waiting. This PR detects the specific issue by timing the `Thread#suspend` calls and skips the remainder of the test if it timed out because of the JVM bug. Closes #50047
This commit is contained in:
parent
d8510be3d9
commit
6e8ea7aaa2
|
@ -32,6 +32,8 @@ import java.util.Arrays;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
||||||
private Set<Thread> suspendedThreads;
|
private Set<Thread> suspendedThreads;
|
||||||
private Thread blockDetectionThread;
|
private Thread blockDetectionThread;
|
||||||
|
|
||||||
|
private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);
|
||||||
|
|
||||||
public LongGCDisruption(Random random, String disruptedNode) {
|
public LongGCDisruption(Random random, String disruptedNode) {
|
||||||
super(random);
|
super(random);
|
||||||
this.disruptedNode = disruptedNode;
|
this.disruptedNode = disruptedNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if during disruption we ran into a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
|
||||||
|
* was observed.
|
||||||
|
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
|
||||||
|
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
|
||||||
|
*/
|
||||||
|
public boolean sawSlowSuspendBug() {
|
||||||
|
return sawSlowSuspendBug.get();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized void startDisrupting() {
|
public synchronized void startDisrupting() {
|
||||||
if (suspendedThreads == null) {
|
if (suspendedThreads == null) {
|
||||||
|
@ -251,7 +265,11 @@ public class LongGCDisruption extends SingleNodeDisruption {
|
||||||
* assuming that it is safe.
|
* assuming that it is safe.
|
||||||
*/
|
*/
|
||||||
boolean definitelySafe = true;
|
boolean definitelySafe = true;
|
||||||
|
final long startTime = System.nanoTime();
|
||||||
thread.suspend();
|
thread.suspend();
|
||||||
|
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
|
||||||
|
sawSlowSuspendBug.set(true);
|
||||||
|
}
|
||||||
// double check the thread is not in a shared resource like logging; if so, let it go and come back
|
// double check the thread is not in a shared resource like logging; if so, let it go and come back
|
||||||
safe:
|
safe:
|
||||||
for (StackTraceElement stackElement : thread.getStackTrace()) {
|
for (StackTraceElement stackElement : thread.getStackTrace()) {
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
*/
|
*/
|
||||||
package org.elasticsearch.test.disruption;
|
package org.elasticsearch.test.disruption;
|
||||||
|
|
||||||
import org.elasticsearch.bootstrap.JavaVersion;
|
|
||||||
import org.elasticsearch.common.Nullable;
|
import org.elasticsearch.common.Nullable;
|
||||||
import org.elasticsearch.test.ESTestCase;
|
import org.elasticsearch.test.ESTestCase;
|
||||||
|
|
||||||
|
@ -115,8 +114,6 @@ public class LongGCDisruptionTests extends ESTestCase {
|
||||||
* but does keep retrying until all threads can be safely paused
|
* but does keep retrying until all threads can be safely paused
|
||||||
*/
|
*/
|
||||||
public void testNotBlockingUnsafeStackTraces() throws Exception {
|
public void testNotBlockingUnsafeStackTraces() throws Exception {
|
||||||
assumeFalse("https://github.com/elastic/elasticsearch/issues/50047",
|
|
||||||
JavaVersion.current().equals(JavaVersion.parse("11")) || JavaVersion.current().equals(JavaVersion.parse("12")));
|
|
||||||
final String nodeName = "test_node";
|
final String nodeName = "test_node";
|
||||||
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
|
LongGCDisruption disruption = new LongGCDisruption(random(), nodeName) {
|
||||||
@Override
|
@Override
|
||||||
|
@ -149,7 +146,14 @@ public class LongGCDisruptionTests extends ESTestCase {
|
||||||
threads[i].start();
|
threads[i].start();
|
||||||
}
|
}
|
||||||
// make sure some threads are under lock
|
// make sure some threads are under lock
|
||||||
disruption.startDisrupting();
|
try {
|
||||||
|
disruption.startDisrupting();
|
||||||
|
} catch (RuntimeException e) {
|
||||||
|
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
throw new AssertionError(e);
|
||||||
|
}
|
||||||
long first = ops.get();
|
long first = ops.get();
|
||||||
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
|
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
|
@ -157,6 +161,7 @@ public class LongGCDisruptionTests extends ESTestCase {
|
||||||
disruption.stopDisrupting();
|
disruption.stopDisrupting();
|
||||||
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
|
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
|
||||||
} finally {
|
} finally {
|
||||||
|
disruption.stopDisrupting();
|
||||||
stop.set(true);
|
stop.set(true);
|
||||||
for (final Thread thread : threads) {
|
for (final Thread thread : threads) {
|
||||||
thread.join();
|
thread.join();
|
||||||
|
|
Loading…
Reference in New Issue