HADOOP-9090. Support on-demand publish of metrics. Contributed by Mostafa Elhemali.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1416538 13f79535-47bb-0310-9956-ffa450edef68
Suresh Srinivas 2012-12-03 14:23:16 +00:00
parent 235749a8ab
commit e5d54ac89a
6 changed files with 270 additions and 32 deletions

View File

@ -306,6 +306,10 @@ Release 2.0.3-alpha - Unreleased
HADOOP-9020. Add a SASL PLAIN server (daryn via bobby)
HADOOP-9090. Support on-demand publish of metrics. (Mostafa Elhemali via
suresh)
IMPROVEMENTS
HADOOP-8789. Tests setLevel(Level.OFF) should be Level.ERROR.

View File

@ -90,6 +90,17 @@ public abstract class MetricsSystem implements MetricsSystemMXBean {
*/
public abstract void register(Callback callback);
/**
* Requests an immediate publish of all metrics from sources to sinks.
*
* This is a "soft" request: the expectation is that a best effort will be
* made to synchronously snapshot the metrics from all the sources and put
* them in all the sinks (including flushing the sinks) before returning to
* the caller. If this can't be accomplished in a reasonable time, it's OK to
* return to the caller before everything is done.
*/
public abstract void publishMetricsNow();
/**
* Shut down the metrics system completely (usually during server shutdown).
* The MetricsSystemMXBean will be unregistered.
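For orientation, here is a minimal caller-side sketch of this contract (not part of the commit; the class name, the "Example" prefix, and the shutdown-time placement are illustrative assumptions). It forces a best-effort synchronous snapshot-and-flush of all registered sources and sinks before the metrics system is shut down, which is the pattern the updated tests below switch to:

import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;

public class PublishOnExitSketch {
  public static void main(String[] args) {
    MetricsSystem ms = DefaultMetricsSystem.initialize("Example");
    // ... register sources and sinks, run the workload ...
    ms.publishMetricsNow(); // best effort: snapshot every source, push to every sink, flush
    ms.shutdown();          // unregisters the MetricsSystemMXBean and stops the system
  }
}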

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.metrics2.impl;
import java.util.Random;
import java.util.concurrent.*;
import static com.google.common.base.Preconditions.*;
@ -48,6 +49,7 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
private volatile boolean stopping = false;
private volatile boolean inError = false;
private final int period, firstRetryDelay, retryCount;
private final long oobPutTimeout;
private final float retryBackoff;
private final MetricsRegistry registry = new MetricsRegistry("sinkadapter");
private final MutableStat latency;
@ -69,6 +71,8 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
this.period = checkArg(period, period > 0, "period");
firstRetryDelay = checkArg(retryDelay, retryDelay > 0, "retry delay");
this.retryBackoff = checkArg(retryBackoff, retryBackoff>1, "retry backoff");
oobPutTimeout = (long)
(firstRetryDelay * Math.pow(retryBackoff, retryCount) * 1000);
this.retryCount = retryCount;
this.queue = new SinkQueue<MetricsBuffer>(checkArg(queueCapacity,
queueCapacity > 0, "queue capacity"));
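The oobPutTimeout computed above bounds how long an on-demand (out-of-band) put may block the caller: firstRetryDelay * retryBackoff^retryCount seconds, converted to milliseconds, which is roughly the time the sink thread needs to exhaust its normal retry schedule. As a worked illustration with assumed settings (not necessarily the configured defaults) of retryDelay = 10 s, retryBackoff = 2 and retryCount = 1, the timeout is 10 * 2^1 * 1000 = 20,000 ms.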
@ -95,6 +99,23 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
}
return true; // OK
}
public boolean putMetricsImmediate(MetricsBuffer buffer) {
WaitableMetricsBuffer waitableBuffer =
new WaitableMetricsBuffer(buffer);
if (!queue.enqueue(waitableBuffer)) {
LOG.warn(name + " has a full queue and can't consume the given metrics.");
dropped.incr();
return false;
}
if (!waitableBuffer.waitTillNotified(oobPutTimeout)) {
LOG.warn(name +
" couldn't fulfill an immediate putMetrics request in time." +
" Abandoning.");
return false;
}
return true;
}
void publishMetricsFromQueue() {
int retryDelay = firstRetryDelay;
@ -158,6 +179,9 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
sink.flush();
latency.add(Time.now() - ts);
}
if (buffer instanceof WaitableMetricsBuffer) {
((WaitableMetricsBuffer)buffer).notifyAnyWaiters();
}
LOG.debug("Done");
}
@ -191,4 +215,26 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
MetricsSink sink() {
return sink;
}
static class WaitableMetricsBuffer extends MetricsBuffer {
private final Semaphore notificationSemaphore =
new Semaphore(0);
public WaitableMetricsBuffer(MetricsBuffer metricsBuffer) {
super(metricsBuffer);
}
public boolean waitTillNotified(long millisecondsToWait) {
try {
return notificationSemaphore.tryAcquire(millisecondsToWait,
TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
return false;
}
}
public void notifyAnyWaiters() {
notificationSemaphore.release();
}
}
}
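Taken together, the immediate path reuses the existing machinery: putMetricsImmediate() wraps the snapshot in a WaitableMetricsBuffer, enqueues it on the same queue the periodic timer path feeds, and blocks on the buffer's semaphore; the consumer thread publishes and flushes the buffer exactly as it would any other, then releases the semaphore in notifyAnyWaiters(), waking the caller (or the caller gives up after oobPutTimeout). A stripped-down sketch of that handshake, with purely illustrative names and values:

import java.util.concurrent.*;

public class HandshakeSketch {
  // Illustrative stand-in for WaitableMetricsBuffer: a payload plus a one-shot signal.
  static class WaitableItem {
    final String payload;
    final Semaphore done = new Semaphore(0);
    WaitableItem(String payload) { this.payload = payload; }
  }

  public static void main(String[] args) throws Exception {
    final BlockingQueue<WaitableItem> queue = new LinkedBlockingQueue<WaitableItem>(20);

    Thread sinkThread = new Thread(new Runnable() {
      @Override
      public void run() {
        try {
          while (true) {
            WaitableItem item = queue.take();    // same queue the timer-driven path feeds
            System.out.println("publish + flush: " + item.payload);
            item.done.release();                 // notifyAnyWaiters()
          }
        } catch (InterruptedException ignored) {
        }
      }
    });
    sinkThread.setDaemon(true);
    sinkThread.start();

    WaitableItem item = new WaitableItem("snapshot");
    if (!queue.offer(item)) {                    // enqueue, as putMetricsImmediate() does
      System.out.println("queue full, dropping");
      return;
    }
    boolean flushed = item.done.tryAcquire(20000, TimeUnit.MILLISECONDS); // waitTillNotified()
    System.out.println(flushed ? "flushed in time" : "timed out, abandoning");
  }
}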

View File

@ -344,9 +344,19 @@ public class MetricsSystemImpl extends MetricsSystem implements MetricsSource {
synchronized void onTimerEvent() {
logicalTime += period;
if (sinks.size() > 0) {
publishMetrics(sampleMetrics());
publishMetrics(sampleMetrics(), false);
}
}
/**
* Requests an immediate publish of all metrics from sources to sinks.
*/
@Override
public void publishMetricsNow() {
if (sinks.size() > 0) {
publishMetrics(sampleMetrics(), true);
}
}
/**
* Sample all the sources for a snapshot of metrics/tags
@ -380,12 +390,20 @@ public class MetricsSystemImpl extends MetricsSystem implements MetricsSource {
/**
* Publish a metrics snapshot to all the sinks
* @param buffer the metrics snapshot to publish
* @param immediate indicates that we should publish metrics immediately
* instead of using a separate thread.
*/
synchronized void publishMetrics(MetricsBuffer buffer) {
synchronized void publishMetrics(MetricsBuffer buffer, boolean immediate) {
int dropped = 0;
for (MetricsSinkAdapter sa : sinks.values()) {
long startTime = Time.now();
dropped += sa.putMetrics(buffer, logicalTime) ? 0 : 1;
boolean result;
if (immediate) {
result = sa.putMetricsImmediate(buffer);
} else {
result = sa.putMetrics(buffer, logicalTime);
}
dropped += result ? 0 : 1;
publishStat.add(Time.now() - startTime);
}
droppedPubAll.incr(dropped);
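Note that the immediate flag only changes how each sink is handed the buffer; the per-sink accounting above is unchanged. A sink that cannot accept the snapshot (its queue is full, or an immediate put times out) just contributes one to the dropped count, and the loop moves on so the remaining sinks still receive and flush the buffer. As a small worked example (sink count hypothetical): with three registered sinks of which one hangs, an immediate publish returns after that sink's timeout with dropped = 0 + 0 + 1 = 1, so droppedPubAll grows by 1; the testHangingSink case further down exercises this behavior.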

View File

@ -29,8 +29,6 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -115,31 +113,23 @@ public class TestGangliaMetrics {
final int expectedCountFromGanglia30 = expectedMetrics.length;
final int expectedCountFromGanglia31 = 2 * expectedMetrics.length;
// use latch to make sure we received required records before shutting
// down the MetricSystem
CountDownLatch latch = new CountDownLatch(
expectedCountFromGanglia30 + expectedCountFromGanglia31);
// Setup test for GangliaSink30
AbstractGangliaSink gsink30 = new GangliaSink30();
gsink30.init(cb.subset("test"));
MockDatagramSocket mockds30 = new MockDatagramSocket(latch);
MockDatagramSocket mockds30 = new MockDatagramSocket();
GangliaMetricsTestHelper.setDatagramSocket(gsink30, mockds30);
// Setup test for GangliaSink31
AbstractGangliaSink gsink31 = new GangliaSink31();
gsink31.init(cb.subset("test"));
MockDatagramSocket mockds31 = new MockDatagramSocket(latch);
MockDatagramSocket mockds31 = new MockDatagramSocket();
GangliaMetricsTestHelper.setDatagramSocket(gsink31, mockds31);
// register the sinks
ms.register("gsink30", "gsink30 desc", gsink30);
ms.register("gsink31", "gsink31 desc", gsink31);
ms.onTimerEvent(); // trigger something interesting
ms.publishMetricsNow(); // publish the metrics
// wait for all records and the stop MetricSystem. Without this
// sometime the ms gets shutdown before all the sinks have consumed
latch.await(200, TimeUnit.MILLISECONDS);
ms.stop();
// check GangliaSink30 data
@ -198,7 +188,6 @@ public class TestGangliaMetrics {
*/
private class MockDatagramSocket extends DatagramSocket {
private ArrayList<byte[]> capture;
private CountDownLatch latch;
/**
* @throws SocketException
@ -207,15 +196,6 @@ public class TestGangliaMetrics {
capture = new ArrayList<byte[]>();
}
/**
* @param latch
* @throws SocketException
*/
public MockDatagramSocket(CountDownLatch latch) throws SocketException {
this();
this.latch = latch;
}
/* (non-Javadoc)
* @see java.net.DatagramSocket#send(java.net.DatagramPacket)
*/
@ -225,9 +205,6 @@ public class TestGangliaMetrics {
byte[] bytes = new byte[p.getLength()];
System.arraycopy(p.getData(), p.getOffset(), bytes, 0, p.getLength());
capture.add(bytes);
// decrement the latch
latch.countDown();
}
/**

View File

@ -18,7 +18,11 @@
package org.apache.hadoop.metrics2.impl;
import java.util.List;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.*;
import javax.annotation.Nullable;
import org.junit.Test;
import org.junit.runner.RunWith;
@ -26,9 +30,11 @@ import org.junit.runner.RunWith;
import org.mockito.ArgumentCaptor;
import org.mockito.Captor;
import org.mockito.runners.MockitoJUnitRunner;
import static org.junit.Assert.*;
import static org.mockito.Mockito.*;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import org.apache.commons.configuration.SubsetConfiguration;
@ -36,6 +42,8 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.metrics2.MetricsException;
import static org.apache.hadoop.test.MoreAsserts.*;
import org.apache.hadoop.metrics2.AbstractMetric;
import org.apache.hadoop.metrics2.MetricsRecord;
import org.apache.hadoop.metrics2.MetricsSink;
import org.apache.hadoop.metrics2.MetricsSource;
@ -47,6 +55,7 @@ import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.util.StringUtils;
/**
* Test the MetricsSystemImpl class
@ -72,7 +81,7 @@ public class TestMetricsSystemImpl {
}
@Test public void testInitFirst() throws Exception {
ConfigBuilder cb = new ConfigBuilder().add("*.period", 8)
new ConfigBuilder().add("*.period", 8)
//.add("test.sink.plugin.urls", getPluginUrlsAsString())
.add("test.sink.test.class", TestSink.class.getName())
.add("test.*.source.filter.exclude", "s0")
@ -93,8 +102,9 @@ public class TestMetricsSystemImpl {
MetricsSink sink2 = mock(MetricsSink.class);
ms.registerSink("sink1", "sink1 desc", sink1);
ms.registerSink("sink2", "sink2 desc", sink2);
ms.onTimerEvent(); // trigger something interesting
ms.publishMetricsNow(); // publish the metrics
ms.stop();
ms.shutdown();
verify(sink1, times(2)).putMetrics(r1.capture());
List<MetricsRecord> mr1 = r1.getAllValues();
@ -104,6 +114,177 @@ public class TestMetricsSystemImpl {
assertEquals("output", mr1, mr2);
}
@Test public void testMultiThreadedPublish() throws Exception {
new ConfigBuilder().add("*.period", 80)
.add("test.sink.Collector.queue.capacity", "20")
.save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
final MetricsSystemImpl ms = new MetricsSystemImpl("Test");
ms.start();
final int numThreads = 10;
final CollectingSink sink = new CollectingSink(numThreads);
ms.registerSink("Collector",
"Collector of values from all threads.", sink);
final TestSource[] sources = new TestSource[numThreads];
final Thread[] threads = new Thread[numThreads];
final String[] results = new String[numThreads];
final CyclicBarrier barrier1 = new CyclicBarrier(numThreads),
barrier2 = new CyclicBarrier(numThreads);
for (int i = 0; i < numThreads; i++) {
sources[i] = ms.register("threadSource" + i,
"A source of my threaded goodness.",
new TestSource("threadSourceRec" + i));
threads[i] = new Thread(new Runnable() {
private boolean safeAwait(int mySource, CyclicBarrier barrier) {
try {
barrier.await(2, TimeUnit.SECONDS);
} catch (InterruptedException e) {
results[mySource] = "Interrupted";
return false;
} catch (BrokenBarrierException e) {
results[mySource] = "Broken Barrier";
return false;
} catch (TimeoutException e) {
results[mySource] = "Timed out on barrier";
return false;
}
return true;
}
@Override
public void run() {
int mySource = Integer.parseInt(Thread.currentThread().getName());
if (sink.collected[mySource].get() != 0L) {
results[mySource] = "Someone else collected my metric!";
return;
}
// Wait for all the threads to come here so we can hammer
// the system at the same time
if (!safeAwait(mySource, barrier1)) return;
sources[mySource].g1.set(230);
ms.publishMetricsNow();
// Since some other thread may have snatched my metric,
// I need to wait for the threads to finish before checking.
if (!safeAwait(mySource, barrier2)) return;
if (sink.collected[mySource].get() != 230L) {
results[mySource] = "Metric not collected!";
return;
}
results[mySource] = "Passed";
}
}, "" + i);
}
for (Thread t : threads)
t.start();
for (Thread t : threads)
t.join();
assertEquals(0L, ms.droppedPubAll.value());
assertTrue(StringUtils.join("\n", Arrays.asList(results)),
Iterables.all(Arrays.asList(results), new Predicate<String>() {
@Override
public boolean apply(@Nullable String input) {
return input.equalsIgnoreCase("Passed");
}
}));
ms.stop();
ms.shutdown();
}
private static class CollectingSink implements MetricsSink {
private final AtomicLong[] collected;
public CollectingSink(int capacity) {
collected = new AtomicLong[capacity];
for (int i = 0; i < capacity; i++) {
collected[i] = new AtomicLong();
}
}
@Override
public void init(SubsetConfiguration conf) {
}
@Override
public void putMetrics(MetricsRecord record) {
final String prefix = "threadSourceRec";
if (record.name().startsWith(prefix)) {
final int recordNumber = Integer.parseInt(
record.name().substring(prefix.length()));
ArrayList<String> names = new ArrayList<String>();
for (AbstractMetric m : record.metrics()) {
if (m.name().equalsIgnoreCase("g1")) {
collected[recordNumber].set(m.value().longValue());
return;
}
names.add(m.name());
}
}
}
@Override
public void flush() {
}
}
@Test public void testHangingSink() {
new ConfigBuilder().add("*.period", 8)
.add("test.sink.test.class", TestSink.class.getName())
.add("test.sink.hanging.retry.delay", "1")
.add("test.sink.hanging.retry.backoff", "1.01")
.add("test.sink.hanging.retry.count", "0")
.save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
MetricsSystemImpl ms = new MetricsSystemImpl("Test");
ms.start();
TestSource s = ms.register("s3", "s3 desc", new TestSource("s3rec"));
s.c1.incr();
HangingSink hanging = new HangingSink();
ms.registerSink("hanging", "Hang the sink!", hanging);
ms.publishMetricsNow();
assertEquals(1L, ms.droppedPubAll.value());
assertFalse(hanging.getInterrupted());
ms.stop();
ms.shutdown();
assertTrue(hanging.getInterrupted());
assertTrue("The sink didn't get called after its first hang " +
"for subsequent records.", hanging.getGotCalledSecondTime());
}
private static class HangingSink implements MetricsSink {
private volatile boolean interrupted;
private boolean gotCalledSecondTime;
private boolean firstTime = true;
public boolean getGotCalledSecondTime() {
return gotCalledSecondTime;
}
public boolean getInterrupted() {
return interrupted;
}
@Override
public void init(SubsetConfiguration conf) {
}
@Override
public void putMetrics(MetricsRecord record) {
// No need to hang every time, just the first record.
if (!firstTime) {
gotCalledSecondTime = true;
return;
}
firstTime = false;
try {
Thread.sleep(10 * 1000);
} catch (InterruptedException ex) {
interrupted = true;
}
}
@Override
public void flush() {
}
}
@Test public void testRegisterDups() {
MetricsSystem ms = new MetricsSystemImpl();
TestSource ts1 = new TestSource("ts1");
@ -116,6 +297,7 @@ public class TestMetricsSystemImpl {
MetricsSource s2 = ms.getSource("ts1");
assertNotNull(s2);
assertNotSame(s1, s2);
ms.shutdown();
}
@Test(expected=MetricsException.class) public void testRegisterDupError() {