ARTEMIS-3271 Improve Critical Analyzer to sample single therads

worked in collaboration with Gary Tully on this fix
This commit is contained in:
Clebert Suconic 2021-04-27 14:15:03 -04:00 committed by Gary Tully
parent 42405fedcf
commit 39f468450b
4 changed files with 166 additions and 36 deletions

View File

@ -18,6 +18,7 @@
package org.apache.activemq.artemis.utils.critical;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import org.jboss.logging.Logger;
@ -30,10 +31,15 @@ public class CriticalMeasure {
//uses updaters to avoid creates many AtomicLong instances
static final AtomicLongFieldUpdater<CriticalMeasure> TIME_ENTER_UPDATER = AtomicLongFieldUpdater.newUpdater(CriticalMeasure.class, "timeEnter");
static final AtomicLongFieldUpdater<CriticalMeasure> TIME_LEFT_UPDATER = AtomicLongFieldUpdater.newUpdater(CriticalMeasure.class, "timeLeft");
static final AtomicReferenceFieldUpdater<CriticalMeasure, Thread> CURRENT_THREAD_UDPATER = AtomicReferenceFieldUpdater.newUpdater(CriticalMeasure.class, Thread.class, "currentThread");
// While reseting the leaveMethod, I want to make sure no enter call would reset the value.
// so I set the Current Thread to this Ghost Thread, to then set it back to null
private static final Thread GHOST_THREAD = new Thread();
private volatile Thread currentThread;
private volatile long timeEnter;
private volatile long timeLeft;
private final int id;
private final CriticalComponent component;
@ -44,35 +50,49 @@ public class CriticalMeasure {
//prefer this approach instead of using some fixed value because System::nanoTime could change sign
//with long running processes
long time = System.nanoTime();
TIME_LEFT_UPDATER.set(this, time);
TIME_ENTER_UPDATER.set(this, time);
TIME_ENTER_UPDATER.set(this, 0);
}
public void enterCritical() {
//prefer lazySet in order to avoid heavy-weight full barriers on x86
TIME_ENTER_UPDATER.lazySet(this, System.nanoTime());
if (logger.isTraceEnabled()) {
traceEnter = new Exception("entered");
// We should only measure a single thread from all of the threads.
// We are fine to ignore any other calls made to this component, we only need one thread measured
if (CURRENT_THREAD_UDPATER.compareAndSet(this, null, Thread.currentThread())) {
//prefer lazySet in order to avoid heavy-weight full barriers on x86
TIME_ENTER_UPDATER.lazySet(this, System.nanoTime());
if (logger.isTraceEnabled()) {
traceEnter = new Exception("entered");
}
}
}
public void leaveCritical() {
if (logger.isTraceEnabled()) {
// We should only measure a single thread from all of the threads.
// We are fine to ignore any other calls made to this component, we only need one thread measured
if (CURRENT_THREAD_UDPATER.compareAndSet(this, Thread.currentThread(), GHOST_THREAD)) {
// NULL_THREAD here represents a state where I would be ignoring any call to enterCritical or levalCritical, while I reset the Time Enter Update
// This is to avoid replacing time Enter by a new Value, right after current Thread is set to Null.
// So we set to this ghost value while we are setting
CriticalAnalyzer analyzer = component != null ? component.getCriticalAnalyzer() : null;
if (analyzer != null) {
long nanoTimeout = analyzer.getTimeoutNanoSeconds();
if (checkExpiration(nanoTimeout, false)) {
logger.trace("Path " + id + " on component " + getComponentName() + " is taking too long, leaving at", new Exception("left"));
logger.trace("Path " + id + " on component " + getComponentName() + " is taking too long, entered at", traceEnter);
if (logger.isTraceEnabled()) {
CriticalAnalyzer analyzer = component != null ? component.getCriticalAnalyzer() : null;
if (analyzer != null) {
long nanoTimeout = analyzer.getTimeoutNanoSeconds();
if (checkExpiration(nanoTimeout, false)) {
logger.trace("Path " + id + " on component " + getComponentName() + " is taking too long, leaving at", new Exception("left"));
logger.trace("Path " + id + " on component " + getComponentName() + " is taking too long, entered at", traceEnter);
}
}
traceEnter = null;
}
traceEnter = null;
}
TIME_ENTER_UPDATER.set(this, 0);
TIME_LEFT_UPDATER.lazySet(this, System.nanoTime());
// I am pretty sure this is single threaded by now.. I don't need compareAndSet here
CURRENT_THREAD_UDPATER.set(this, null);
}
}
protected String getComponentName() {
@ -84,12 +104,10 @@ public class CriticalMeasure {
}
public boolean checkExpiration(long timeout, boolean reset) {
long time = System.nanoTime();
final long timeLeft = TIME_LEFT_UPDATER.get(this);
final long timeEnter = TIME_ENTER_UPDATER.get(this);
//due to how System::nanoTime works is better to use differences to prevent numerical overflow while comparing
if (timeLeft - timeEnter < 0) {
boolean expired = System.nanoTime() - timeEnter > timeout;
if (timeEnter != 0L) {
long time = System.nanoTime();
boolean expired = time - timeEnter > timeout;
if (expired) {
Exception lastTraceEnter = this.traceEnter;
@ -100,10 +118,6 @@ public class CriticalMeasure {
logger.warn("Component " + getComponentName() + " is expired on path " + id);
}
if (reset) {
TIME_LEFT_UPDATER.lazySet(this, time);
TIME_ENTER_UPDATER.lazySet(this, time);
}
}
return expired;
}
@ -113,8 +127,4 @@ public class CriticalMeasure {
public long enterTime() {
return TIME_ENTER_UPDATER.get(this);
}
public long leaveTime() {
return TIME_LEFT_UPDATER.get(this);
}
}

View File

@ -156,6 +156,13 @@ public class Wait {
public static boolean waitFor(final Condition condition,
final long durationMillis,
final long sleepMillis) {
return waitFor(condition, durationMillis, sleepMillis, true);
}
public static boolean waitFor(final Condition condition,
final long durationMillis,
final long sleepMillis,
final boolean printThreadDump) {
try {
final long expiry = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(durationMillis);
@ -168,7 +175,7 @@ public class Wait {
}
conditionSatisified = condition.isSatisfied();
}
if (!conditionSatisified) {
if (!conditionSatisified && printThreadDump) {
System.out.println(ThreadDumpUtil.threadDump("thread dump"));
}
return conditionSatisified;

View File

@ -28,8 +28,7 @@ public class CriticalMeasureTest {
public void testCriticalMeasure() throws Exception {
CriticalMeasure measure = new CriticalMeasure(null, 1);
long time = System.nanoTime();
CriticalMeasure.TIME_ENTER_UPDATER.set(measure, time - TimeUnit.MINUTES.toNanos(5));
CriticalMeasure.TIME_LEFT_UPDATER.set(measure, time);
CriticalMeasure.TIME_ENTER_UPDATER.set(measure, time - TimeUnit.SECONDS.toNanos(5));
Assert.assertFalse(measure.checkExpiration(TimeUnit.SECONDS.toNanos(30), false));
}
@ -39,7 +38,8 @@ public class CriticalMeasureTest {
CriticalComponent component = new CriticalComponentImpl(analyzer, 5);
CriticalMeasure measure = new CriticalMeasure(component, 1);
long time = System.nanoTime();
CriticalMeasure.TIME_ENTER_UPDATER.set(measure, time - TimeUnit.MINUTES.toNanos(5));
CriticalMeasure.CURRENT_THREAD_UDPATER.set(measure, Thread.currentThread());
CriticalMeasure.TIME_ENTER_UPDATER.set(measure, time - TimeUnit.MINUTES.toNanos(30));
measure.leaveCritical();
Assert.assertFalse(measure.checkExpiration(TimeUnit.SECONDS.toNanos(30), false));
}
@ -52,7 +52,6 @@ public class CriticalMeasureTest {
long time = System.nanoTime();
measure.enterCritical();
CriticalMeasure.TIME_ENTER_UPDATER.set(measure, time - TimeUnit.MINUTES.toNanos(5));
CriticalMeasure.TIME_LEFT_UPDATER.set(measure, time - TimeUnit.MINUTES.toNanos(10));
Assert.assertTrue(measure.checkExpiration(TimeUnit.SECONDS.toNanos(30), false));
measure.leaveCritical();
}

View File

@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.activemq.artemis.utils.critical;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.LockSupport;
import org.apache.activemq.artemis.utils.ReusableLatch;
import org.apache.activemq.artemis.utils.Wait;
import org.jboss.logging.Logger;
import org.junit.Assert;
import org.junit.Test;
public class MultiThreadCriticalMeasureTest {
private static final Logger logger = Logger.getLogger(MultiThreadCriticalMeasureTest.class);
@Test
public void testMultiThread() throws Throwable {
int THREADS = 100;
AtomicInteger errors = new AtomicInteger(0);
Thread[] threads = new Thread[THREADS];
AtomicBoolean running = new AtomicBoolean(true);
ReusableLatch latch = new ReusableLatch(0);
ReusableLatch latchOnMeasure = new ReusableLatch(0);
try {
CriticalMeasure measure = new CriticalMeasure((t, r) -> false, 0);
CyclicBarrier barrier = new CyclicBarrier(THREADS + 1);
Runnable runnable = () -> {
try {
logger.debug("Thread " + Thread.currentThread().getName() + " waiting to Star");
barrier.await();
logger.debug("Thread " + Thread.currentThread().getName() + " Started");
while (running.get()) {
if (!latch.await(1, TimeUnit.NANOSECONDS)) {
latch.await();
}
measure.enterCritical();
latchOnMeasure.await();
measure.leaveCritical();
}
} catch (Throwable e) {
e.printStackTrace();
errors.incrementAndGet();
}
};
for (int i = 0; i < THREADS; i++) {
threads[i] = new Thread(runnable, "t=" + i);
threads[i].start();
}
logger.debug("Going to release it now");
barrier.await();
for (int i = 0; i < 50; i++) {
LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(10));
logger.debug("Count up " + i);
// simulating load down on the system... this will freeze load
latch.countUp();
LockSupport.parkNanos(TimeUnit.MILLISECONDS.toNanos(20));
Assert.assertFalse(measure.checkExpiration(TimeUnit.MILLISECONDS.toNanos(10), false));
logger.debug("Count down");
// this will resume load
latch.countDown();
}
latchOnMeasure.countUp();
Assert.assertTrue(Wait.waitFor(() -> measure.checkExpiration(TimeUnit.MILLISECONDS.toNanos(100), false), 1_000, 1));
} finally {
latch.countDown();
latchOnMeasure.countDown();
running.set(false);
Assert.assertEquals(0, errors.get());
for (Thread t : threads) {
if (t != null) {
t.join(100);
if (t.isAlive()) {
t.interrupt();
}
}
}
}
}
}