HADOOP-9090. Support on-demand publish of metrics. Contributed by Mostafa Elhemali.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1416538 13f79535-47bb-0310-9956-ffa450edef68
Parent: 235749a8ab
Commit: e5d54ac89a
@@ -306,6 +306,10 @@ Release 2.0.3-alpha - Unreleased

    HADOOP-9020. Add a SASL PLAIN server (daryn via bobby)

    HADOOP-9090. Support on-demand publish of metrics. (Mostafa Elhemali via
    suresh)

  IMPROVEMENTS

    HADOOP-8789. Tests setLevel(Level.OFF) should be Level.ERROR.
@@ -90,6 +90,17 @@ public abstract class MetricsSystem implements MetricsSystemMXBean {
   */
  public abstract void register(Callback callback);

  /**
   * Requests an immediate publish of all metrics from sources to sinks.
   *
   * This is a "soft" request: the expectation is that a best effort will be
   * done to synchronously snapshot the metrics from all the sources and put
   * them in all the sinks (including flushing the sinks) before returning to
   * the caller. If this can't be accomplished in reasonable time it's OK to
   * return to the caller before everything is done.
   */
  public abstract void publishMetricsNow();

  /**
   * Shutdown the metrics system completely (usually during server shutdown.)
   * The MetricsSystemMXBean will be unregistered.
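For orientation, a minimal caller-side sketch of the new API (this example is not part of the patch; the component name is hypothetical, and DefaultMetricsSystem.instance() is the existing accessor for the process-wide metrics system):

    import org.apache.hadoop.metrics2.MetricsSystem;
    import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

    // Hypothetical caller that wants metrics flushed before the process exits.
    public class FlushMetricsOnShutdown {
      public static void flushBeforeExit() {
        MetricsSystem ms = DefaultMetricsSystem.instance();
        // Best-effort synchronous snapshot-and-flush of all sources into all
        // sinks; may return early if it cannot finish in reasonable time.
        ms.publishMetricsNow();
      }
    }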
@@ -19,6 +19,7 @@
package org.apache.hadoop.metrics2.impl;

import java.util.Random;
import java.util.concurrent.*;

import static com.google.common.base.Preconditions.*;
@@ -48,6 +49,7 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
  private volatile boolean stopping = false;
  private volatile boolean inError = false;
  private final int period, firstRetryDelay, retryCount;
  private final long oobPutTimeout;
  private final float retryBackoff;
  private final MetricsRegistry registry = new MetricsRegistry("sinkadapter");
  private final MutableStat latency;
@@ -69,6 +71,8 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
    this.period = checkArg(period, period > 0, "period");
    firstRetryDelay = checkArg(retryDelay, retryDelay > 0, "retry delay");
    this.retryBackoff = checkArg(retryBackoff, retryBackoff>1, "retry backoff");
    oobPutTimeout = (long)
        (firstRetryDelay * Math.pow(retryBackoff, retryCount) * 1000);
    this.retryCount = retryCount;
    this.queue = new SinkQueue<MetricsBuffer>(checkArg(queueCapacity,
        queueCapacity > 0, "queue capacity"));
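The out-of-band put timeout is sized to cover a full retry cycle of the sink: the first retry delay, grown by the backoff factor once per configured retry, converted from seconds to milliseconds. A small worked example with made-up configuration values (illustrative only, not the shipped defaults):

    // Hypothetical sink settings: 10 s first retry delay, 2x backoff, 3 retries.
    int firstRetryDelay = 10;
    float retryBackoff = 2.0f;
    int retryCount = 3;
    long oobPutTimeout = (long)
        (firstRetryDelay * Math.pow(retryBackoff, retryCount) * 1000);
    // 10 * 2^3 * 1000 = 80000, so an immediate put waits at most 80 seconds.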
@@ -95,6 +99,23 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
    }
    return true; // OK
  }

  public boolean putMetricsImmediate(MetricsBuffer buffer) {
    WaitableMetricsBuffer waitableBuffer =
        new WaitableMetricsBuffer(buffer);
    if (!queue.enqueue(waitableBuffer)) {
      LOG.warn(name + " has a full queue and can't consume the given metrics.");
      dropped.incr();
      return false;
    }
    if (!waitableBuffer.waitTillNotified(oobPutTimeout)) {
      LOG.warn(name +
          " couldn't fulfill an immediate putMetrics request in time." +
          " Abandoning.");
      return false;
    }
    return true;
  }

  void publishMetricsFromQueue() {
    int retryDelay = firstRetryDelay;
@@ -158,6 +179,9 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
      sink.flush();
      latency.add(Time.now() - ts);
    }
    if (buffer instanceof WaitableMetricsBuffer) {
      ((WaitableMetricsBuffer)buffer).notifyAnyWaiters();
    }
    LOG.debug("Done");
  }
@@ -191,4 +215,26 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
  MetricsSink sink() {
    return sink;
  }

  static class WaitableMetricsBuffer extends MetricsBuffer {
    private final Semaphore notificationSemaphore =
        new Semaphore(0);

    public WaitableMetricsBuffer(MetricsBuffer metricsBuffer) {
      super(metricsBuffer);
    }

    public boolean waitTillNotified(long millisecondsToWait) {
      try {
        return notificationSemaphore.tryAcquire(millisecondsToWait,
            TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
        return false;
      }
    }

    public void notifyAnyWaiters() {
      notificationSemaphore.release();
    }
  }
}
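putMetricsImmediate is a bounded hand-off: the calling thread enqueues a buffer that carries its own zero-permit semaphore, blocks on tryAcquire with a timeout, and the sink thread releases the permit once the buffer has been pushed and flushed. A self-contained sketch of that handshake with plain JDK types (generic names; an illustration of the pattern, not Hadoop code):

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;
    import java.util.concurrent.Semaphore;
    import java.util.concurrent.TimeUnit;

    public class WaitableHandoffDemo {
      // A work item that lets its producer wait until a consumer has handled it.
      static class WaitableItem {
        final String payload;
        private final Semaphore handled = new Semaphore(0);
        WaitableItem(String payload) { this.payload = payload; }
        boolean awaitHandled(long timeoutMillis) throws InterruptedException {
          return handled.tryAcquire(timeoutMillis, TimeUnit.MILLISECONDS);
        }
        void markHandled() { handled.release(); }
      }

      public static void main(String[] args) throws Exception {
        BlockingQueue<WaitableItem> queue = new LinkedBlockingQueue<>(20);

        // Consumer thread: drains the queue and notifies the waiting producer.
        Thread consumer = new Thread(() -> {
          try {
            while (true) {
              WaitableItem item = queue.take();
              System.out.println("handled " + item.payload);
              item.markHandled();
            }
          } catch (InterruptedException ignored) {
          }
        });
        consumer.setDaemon(true);
        consumer.start();

        // Producer side: enqueue, then wait with a timeout, mirroring the
        // enqueue-then-waitTillNotified shape of putMetricsImmediate.
        WaitableItem item = new WaitableItem("snapshot");
        boolean delivered = queue.offer(item) && item.awaitHandled(2000);
        System.out.println(delivered ? "published" : "dropped or timed out");
      }
    }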
@@ -344,9 +344,19 @@ public class MetricsSystemImpl extends MetricsSystem implements MetricsSource {
  synchronized void onTimerEvent() {
    logicalTime += period;
    if (sinks.size() > 0) {
      publishMetrics(sampleMetrics());
      publishMetrics(sampleMetrics(), false);
    }
  }

  /**
   * Requests an immediate publish of all metrics from sources to sinks.
   */
  @Override
  public void publishMetricsNow() {
    if (sinks.size() > 0) {
      publishMetrics(sampleMetrics(), true);
    }
  }

  /**
   * Sample all the sources for a snapshot of metrics/tags
@@ -380,12 +390,20 @@
  /**
   * Publish a metrics snapshot to all the sinks
   * @param buffer the metrics snapshot to publish
   * @param immediate indicates that we should publish metrics immediately
   *                  instead of using a separate thread.
   */
  synchronized void publishMetrics(MetricsBuffer buffer) {
  synchronized void publishMetrics(MetricsBuffer buffer, boolean immediate) {
    int dropped = 0;
    for (MetricsSinkAdapter sa : sinks.values()) {
      long startTime = Time.now();
      dropped += sa.putMetrics(buffer, logicalTime) ? 0 : 1;
      boolean result;
      if (immediate) {
        result = sa.putMetricsImmediate(buffer);
      } else {
        result = sa.putMetrics(buffer, logicalTime);
      }
      dropped += result ? 0 : 1;
      publishStat.add(Time.now() - startTime);
    }
    droppedPubAll.incr(dropped);
@@ -29,8 +29,6 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -115,31 +113,23 @@ public class TestGangliaMetrics {
    final int expectedCountFromGanglia30 = expectedMetrics.length;
    final int expectedCountFromGanglia31 = 2 * expectedMetrics.length;

    // use latch to make sure we received required records before shutting
    // down the MetricSystem
    CountDownLatch latch = new CountDownLatch(
        expectedCountFromGanglia30 + expectedCountFromGanglia31);

    // Setup test for GangliaSink30
    AbstractGangliaSink gsink30 = new GangliaSink30();
    gsink30.init(cb.subset("test"));
    MockDatagramSocket mockds30 = new MockDatagramSocket(latch);
    MockDatagramSocket mockds30 = new MockDatagramSocket();
    GangliaMetricsTestHelper.setDatagramSocket(gsink30, mockds30);

    // Setup test for GangliaSink31
    AbstractGangliaSink gsink31 = new GangliaSink31();
    gsink31.init(cb.subset("test"));
    MockDatagramSocket mockds31 = new MockDatagramSocket(latch);
    MockDatagramSocket mockds31 = new MockDatagramSocket();
    GangliaMetricsTestHelper.setDatagramSocket(gsink31, mockds31);

    // register the sinks
    ms.register("gsink30", "gsink30 desc", gsink30);
    ms.register("gsink31", "gsink31 desc", gsink31);
    ms.onTimerEvent(); // trigger something interesting
    ms.publishMetricsNow(); // publish the metrics

    // wait for all records and the stop MetricSystem. Without this
    // sometime the ms gets shutdown before all the sinks have consumed
    latch.await(200, TimeUnit.MILLISECONDS);
    ms.stop();

    // check GanfliaSink30 data
@@ -198,7 +188,6 @@ public class TestGangliaMetrics {
   */
  private class MockDatagramSocket extends DatagramSocket {
    private ArrayList<byte[]> capture;
    private CountDownLatch latch;

    /**
     * @throws SocketException
@@ -207,15 +196,6 @@ public class TestGangliaMetrics {
      capture = new ArrayList<byte[]>();
    }

    /**
     * @param latch
     * @throws SocketException
     */
    public MockDatagramSocket(CountDownLatch latch) throws SocketException {
      this();
      this.latch = latch;
    }

    /* (non-Javadoc)
     * @see java.net.DatagramSocket#send(java.net.DatagramPacket)
     */
@@ -225,9 +205,6 @@ public class TestGangliaMetrics {
      byte[] bytes = new byte[p.getLength()];
      System.arraycopy(p.getData(), p.getOffset(), bytes, 0, p.getLength());
      capture.add(bytes);

      // decrement the latch
      latch.countDown();
    }

    /**
@@ -18,7 +18,11 @@

package org.apache.hadoop.metrics2.impl;

import java.util.List;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.*;

import javax.annotation.Nullable;

import org.junit.Test;
import org.junit.runner.RunWith;
@@ -26,9 +30,11 @@ import org.junit.runner.RunWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Captor;
import org.mockito.runners.MockitoJUnitRunner;

import static org.junit.Assert.*;
import static org.mockito.Mockito.*;

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;

import org.apache.commons.configuration.SubsetConfiguration;
@@ -36,6 +42,8 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.metrics2.MetricsException;
import static org.apache.hadoop.test.MoreAsserts.*;

import org.apache.hadoop.metrics2.AbstractMetric;
import org.apache.hadoop.metrics2.MetricsRecord;
import org.apache.hadoop.metrics2.MetricsSink;
import org.apache.hadoop.metrics2.MetricsSource;
@@ -47,6 +55,7 @@ import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.util.StringUtils;

/**
 * Test the MetricsSystemImpl class
@@ -72,7 +81,7 @@ public class TestMetricsSystemImpl {
  }

  @Test public void testInitFirst() throws Exception {
    ConfigBuilder cb = new ConfigBuilder().add("*.period", 8)
    new ConfigBuilder().add("*.period", 8)
      //.add("test.sink.plugin.urls", getPluginUrlsAsString())
      .add("test.sink.test.class", TestSink.class.getName())
      .add("test.*.source.filter.exclude", "s0")
@@ -93,8 +102,9 @@ public class TestMetricsSystemImpl {
    MetricsSink sink2 = mock(MetricsSink.class);
    ms.registerSink("sink1", "sink1 desc", sink1);
    ms.registerSink("sink2", "sink2 desc", sink2);
    ms.onTimerEvent(); // trigger something interesting
    ms.publishMetricsNow(); // publish the metrics
    ms.stop();
    ms.shutdown();

    verify(sink1, times(2)).putMetrics(r1.capture());
    List<MetricsRecord> mr1 = r1.getAllValues();
@@ -104,6 +114,177 @@ public class TestMetricsSystemImpl {
    assertEquals("output", mr1, mr2);
  }

  @Test public void testMultiThreadedPublish() throws Exception {
    new ConfigBuilder().add("*.period", 80)
      .add("test.sink.Collector.queue.capacity", "20")
      .save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
    final MetricsSystemImpl ms = new MetricsSystemImpl("Test");
    ms.start();
    final int numThreads = 10;
    final CollectingSink sink = new CollectingSink(numThreads);
    ms.registerSink("Collector",
        "Collector of values from all threads.", sink);
    final TestSource[] sources = new TestSource[numThreads];
    final Thread[] threads = new Thread[numThreads];
    final String[] results = new String[numThreads];
    final CyclicBarrier barrier1 = new CyclicBarrier(numThreads),
        barrier2 = new CyclicBarrier(numThreads);
    for (int i = 0; i < numThreads; i++) {
      sources[i] = ms.register("threadSource" + i,
          "A source of my threaded goodness.",
          new TestSource("threadSourceRec" + i));
      threads[i] = new Thread(new Runnable() {
        private boolean safeAwait(int mySource, CyclicBarrier barrier) {
          try {
            barrier1.await(2, TimeUnit.SECONDS);
          } catch (InterruptedException e) {
            results[mySource] = "Interrupted";
            return false;
          } catch (BrokenBarrierException e) {
            results[mySource] = "Broken Barrier";
            return false;
          } catch (TimeoutException e) {
            results[mySource] = "Timed out on barrier";
            return false;
          }
          return true;
        }

        @Override
        public void run() {
          int mySource = Integer.parseInt(Thread.currentThread().getName());
          if (sink.collected[mySource].get() != 0L) {
            results[mySource] = "Someone else collected my metric!";
            return;
          }
          // Wait for all the threads to come here so we can hammer
          // the system at the same time
          if (!safeAwait(mySource, barrier1)) return;
          sources[mySource].g1.set(230);
          ms.publishMetricsNow();
          // Since some other thread may have snatched my metric,
          // I need to wait for the threads to finish before checking.
          if (!safeAwait(mySource, barrier2)) return;
          if (sink.collected[mySource].get() != 230L) {
            results[mySource] = "Metric not collected!";
            return;
          }
          results[mySource] = "Passed";
        }
      }, "" + i);
    }
    for (Thread t : threads)
      t.start();
    for (Thread t : threads)
      t.join();
    assertEquals(0L, ms.droppedPubAll.value());
    assertTrue(StringUtils.join("\n", Arrays.asList(results)),
      Iterables.all(Arrays.asList(results), new Predicate<String>() {
        @Override
        public boolean apply(@Nullable String input) {
          return input.equalsIgnoreCase("Passed");
        }
      }));
    ms.stop();
    ms.shutdown();
  }

  private static class CollectingSink implements MetricsSink {
    private final AtomicLong[] collected;

    public CollectingSink(int capacity) {
      collected = new AtomicLong[capacity];
      for (int i = 0; i < capacity; i++) {
        collected[i] = new AtomicLong();
      }
    }

    @Override
    public void init(SubsetConfiguration conf) {
    }

    @Override
    public void putMetrics(MetricsRecord record) {
      final String prefix = "threadSourceRec";
      if (record.name().startsWith(prefix)) {
        final int recordNumber = Integer.parseInt(
          record.name().substring(prefix.length()));
        ArrayList<String> names = new ArrayList<String>();
        for (AbstractMetric m : record.metrics()) {
          if (m.name().equalsIgnoreCase("g1")) {
            collected[recordNumber].set(m.value().longValue());
            return;
          }
          names.add(m.name());
        }
      }
    }

    @Override
    public void flush() {
    }
  }

  @Test public void testHangingSink() {
    new ConfigBuilder().add("*.period", 8)
      .add("test.sink.test.class", TestSink.class.getName())
      .add("test.sink.hanging.retry.delay", "1")
      .add("test.sink.hanging.retry.backoff", "1.01")
      .add("test.sink.hanging.retry.count", "0")
      .save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
    MetricsSystemImpl ms = new MetricsSystemImpl("Test");
    ms.start();
    TestSource s = ms.register("s3", "s3 desc", new TestSource("s3rec"));
    s.c1.incr();
    HangingSink hanging = new HangingSink();
    ms.registerSink("hanging", "Hang the sink!", hanging);
    ms.publishMetricsNow();
    assertEquals(1L, ms.droppedPubAll.value());
    assertFalse(hanging.getInterrupted());
    ms.stop();
    ms.shutdown();
    assertTrue(hanging.getInterrupted());
    assertTrue("The sink didn't get called after its first hang " +
               "for subsequent records.", hanging.getGotCalledSecondTime());
  }

  private static class HangingSink implements MetricsSink {
    private volatile boolean interrupted;
    private boolean gotCalledSecondTime;
    private boolean firstTime = true;

    public boolean getGotCalledSecondTime() {
      return gotCalledSecondTime;
    }

    public boolean getInterrupted() {
      return interrupted;
    }

    @Override
    public void init(SubsetConfiguration conf) {
    }

    @Override
    public void putMetrics(MetricsRecord record) {
      // No need to hang every time, just the first record.
      if (!firstTime) {
        gotCalledSecondTime = true;
        return;
      }
      firstTime = false;
      try {
        Thread.sleep(10 * 1000);
      } catch (InterruptedException ex) {
        interrupted = true;
      }
    }

    @Override
    public void flush() {
    }
  }

  @Test public void testRegisterDups() {
    MetricsSystem ms = new MetricsSystemImpl();
    TestSource ts1 = new TestSource("ts1");
@@ -116,6 +297,7 @@ public class TestMetricsSystemImpl {
    MetricsSource s2 = ms.getSource("ts1");
    assertNotNull(s2);
    assertNotSame(s1, s2);
    ms.shutdown();
  }

  @Test(expected=MetricsException.class) public void testRegisterDupError() {