HADOOP-12325. RPC Metrics : Add the ability track and log slow RPCs. Contributed by Anu Engineer

Xiaoyu Yao 2015-08-24 14:31:24 -07:00
parent b5ce87f84d
commit 48774d0a45
12 changed files with 223 additions and 4 deletions

View File

@ -753,6 +753,9 @@ Release 2.8.0 - UNRELEASED
HADOOP-12050. Enable MaxInactiveInterval for hadoop http auth token
(hzlu via benoyantony)
HADOOP-12325. RPC Metrics : Add the ability track and log slow RPCs.
(Anu Engineer via xyao)
OPTIMIZATIONS
HADOOP-11785. Reduce the number of listStatus operation in distcp

View File

@ -235,6 +235,11 @@ public class CommonConfigurationKeysPublic {
/** Default value for IPC_SERVER_MAX_CONNECTIONS_KEY */
public static final int IPC_SERVER_MAX_CONNECTIONS_DEFAULT = 0;
/** Logs if a RPC is really slow compared to rest of RPCs. */
public static final String IPC_SERVER_LOG_SLOW_RPC =
"ipc.server.log.slow.rpc";
public static final boolean IPC_SERVER_LOG_SLOW_RPC_DEFAULT = false;
/** See <a href="{@docRoot}/../core-default.html">core-default.xml</a> */
public static final String HADOOP_RPC_SOCKET_FACTORY_CLASS_DEFAULT_KEY =
"hadoop.rpc.socket.factory.class.default";

View File

@ -567,7 +567,7 @@ private static ProtoClassProtoImpl getProtocolImpl(RPC.Server server,
/**
* This is a server side method, which is invoked over RPC. On success
* the return response has protobuf response payload. On failure, the
* exception name and the stack trace are returned in the response.
* See {@link HadoopRpcResponseProto}
*
* In this method there three types of exceptions possible and they are
@ -657,6 +657,9 @@ public Writable call(RPC.Server server, String connectionProtocolName,
server.rpcMetrics.addRpcProcessingTime(processingTime);
server.rpcDetailedMetrics.addProcessingTime(detailedMetricsName,
processingTime);
if (server.isLogSlowRPC()) {
server.logSlowRpcCalls(methodName, processingTime);
}
}
return new RpcResponseWrapper(result);
}

View File

@ -387,6 +387,62 @@ public static boolean isRpcInvocation() {
private Responder responder = null;
private Handler[] handlers = null;
private boolean logSlowRPC = false;
/**
* Checks whether logging of slow RPCs is enabled.
* @return true if slow RPCs are being logged.
*/
protected boolean isLogSlowRPC() {
return logSlowRPC;
}
/**
* Sets the slow RPC logging flag.
* @param logSlowRPCFlag true to enable logging of slow RPCs.
*/
@VisibleForTesting
protected void setLogSlowRPC(boolean logSlowRPCFlag) {
this.logSlowRPC = logSlowRPCFlag;
}
/**
* Logs a Slow RPC Request.
*
* @param methodName - RPC Request method name
* @param processingTime - Processing Time.
*
* If this request took much longer than other requests we consider
* it a slow RPC. The factor 3 comes from a three-sigma deviation; see
* the 68-95-99.7 rule for a simple explanation. An RPC is flagged as
* slow if and only if its processing time falls above roughly 99.7%
* of requests, and the check only starts once we have a large enough
* sample size.
*/
void logSlowRpcCalls(String methodName, int processingTime) {
final int deviation = 3;
// 1024 for minSampleSize just a guess -- not a number computed based on
// sample size analysis. It is chosen with the hope that this
// number is high enough to avoid spurious logging, yet useful
// in practice.
final int minSampleSize = 1024;
final double threeSigma = rpcMetrics.getProcessingMean() +
(rpcMetrics.getProcessingStdDev() * deviation);
if ((rpcMetrics.getProcessingSampleCount() > minSampleSize) &&
(processingTime > threeSigma)) {
if (LOG.isWarnEnabled()) {
String client = CurCall.get().connection.toString();
LOG.warn(
"Slow RPC : " + methodName + " took " + processingTime +
" milliseconds to process from client " + client);
}
rpcMetrics.incrSlowRpc();
}
}
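As a worked illustration with assumed numbers: once more than 1024 calls have been sampled, a mean processing time of 20 ms and a standard deviation of 15 ms give a threshold of 20 + 3 * 15 = 65 ms, so an 80 ms call is logged as a slow RPC and counted in rpcSlowCalls, while a 50 ms call is not.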
/**
* A convenience method to bind to a given address and report
* better exceptions if the address is not a valid host.
@ -2346,6 +2402,10 @@ protected Server(String bindAddress, int port,
CommonConfigurationKeysPublic.IPC_SERVER_TCPNODELAY_KEY,
CommonConfigurationKeysPublic.IPC_SERVER_TCPNODELAY_DEFAULT);
this.setLogSlowRPC(conf.getBoolean(
CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC,
CommonConfigurationKeysPublic.IPC_SERVER_LOG_SLOW_RPC_DEFAULT));
// Create the responder here
responder = new Responder();

View File

@ -551,6 +551,9 @@ public Writable call(org.apache.hadoop.ipc.RPC.Server server,
server.rpcMetrics.addRpcProcessingTime(processingTime);
server.rpcDetailedMetrics.addProcessingTime(detailedMetricsName,
processingTime);
if (server.isLogSlowRPC()) {
server.logSlowRpcCalls(call.getMethodName(), processingTime);
}
}
}
}

View File

@ -97,6 +97,8 @@ public static RpcMetrics create(Server server, Configuration conf) {
MutableCounterLong rpcAuthorizationSuccesses;
@Metric("Number of client backoff requests")
MutableCounterLong rpcClientBackoff;
@Metric("Number of Slow RPC calls")
MutableCounterLong rpcSlowCalls;
@Metric("Number of open connections") public int numOpenConnections() { @Metric("Number of open connections") public int numOpenConnections() {
return server.getNumOpenConnections(); return server.getNumOpenConnections();
@ -202,4 +204,50 @@ public void addRpcProcessingTime(int processingTime) {
public void incrClientBackoff() {
rpcClientBackoff.incr();
}
/**
* Increments the Slow RPC counter.
*/
public void incrSlowRpc() {
rpcSlowCalls.incr();
}
/**
* Returns the MutableRate that tracks RPC processing time.
* @return MutableRate
*/
public MutableRate getRpcProcessingTime() {
return rpcProcessingTime;
}
/**
* Returns the number of samples that we have seen so far.
* @return long
*/
public long getProcessingSampleCount() {
return rpcProcessingTime.lastStat().numSamples();
}
/**
* Returns mean of RPC Processing Times.
* @return double
*/
public double getProcessingMean() {
return rpcProcessingTime.lastStat().mean();
}
/**
* Return Standard Deviation of the Processing Time.
* @return double
*/
public double getProcessingStdDev() {
return rpcProcessingTime.lastStat().stddev();
}
/**
* Returns the number of slow calls.
* @return long
*/
public long getRpcSlowCalls() {
return rpcSlowCalls.value();
}
}
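For monitoring or tests, the new counter can be read back through the usual metrics2 machinery. A hedged sketch using the existing MetricsAsserts helpers: the record name comes from RpcMetrics#name() and the metric name "RpcSlowCalls" follows the metrics2 convention of capitalizing the annotated field, so treat the exact strings as assumptions.

import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
import org.apache.hadoop.metrics2.MetricsRecordBuilder;

// server is the org.apache.hadoop.ipc.RPC.Server under observation (assumed in scope).
// After driving enough traffic and at least one slow call through the server:
MetricsRecordBuilder rb = getMetrics(server.getRpcMetrics().name());
// Verify that at least one call was classified as slow.
assertCounterGt("RpcSlowCalls", 0L, rb);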

View File

@ -140,7 +140,12 @@ public synchronized void snapshot(MetricsRecordBuilder builder, boolean all) {
}
}
/**
* Return a SampleStat object that supports
* calls like StdDev and Mean.
* @return SampleStat
*/
public SampleStat lastStat() {
return changed() ? intervalStat : prevStat;
}
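Since lastStat() is now public, callers outside the metrics package can read the running statistics of a MutableRate directly. A minimal, hypothetical helper illustrating the accessors involved (the method name and the three-sigma use are assumptions mirroring Server#logSlowRpcCalls):

import org.apache.hadoop.metrics2.lib.MutableRate;

/** Hypothetical helper: mean + 3 * stddev of the latest sampling interval. */
static double threeSigmaThreshold(MutableRate rate) {
  return rate.lastStat().mean() + 3 * rate.lastStat().stddev();
}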

View File

@ -1032,6 +1032,15 @@ for ldap providers in the same way as above does.
</description>
</property>
<property>
<name>ipc.server.log.slow.rpc</name>
<value>false</value>
<description>This setting is useful for troubleshooting performance issues in
various services. If set to true, requests whose processing time falls above
roughly the 99.7th percentile (three standard deviations above the mean) are
logged and the RpcSlowCalls counter is incremented.
</description>
</property>
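For example, an operator troubleshooting intermittent RPC latency can override this property to true in core-site.xml on the affected server, then watch its log for "Slow RPC" warnings and the RpcSlowCalls counter in the RPC metrics record to see which methods are outliers.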
<!-- Proxy Configuration -->
<property>

View File

@ -19,6 +19,8 @@
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
import static org.apache.hadoop.test.MetricsAsserts.assertCounterGt;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.net.InetSocketAddress;
@ -27,7 +29,9 @@
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.ipc.metrics.RpcMetrics;
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcErrorCodeProto;
import org.apache.hadoop.ipc.protobuf.TestProtos;
import org.apache.hadoop.ipc.protobuf.TestProtos.EchoRequestProto;
import org.apache.hadoop.ipc.protobuf.TestProtos.EchoResponseProto;
import org.apache.hadoop.ipc.protobuf.TestProtos.EmptyRequestProto;
@ -41,6 +45,7 @@
import org.junit.Before;
import org.junit.After;
import com.google.protobuf.BlockingService;
import com.google.protobuf.RpcController;
import com.google.protobuf.ServiceException;
@ -56,7 +61,8 @@ public class TestProtoBufRpc {
private static InetSocketAddress addr;
private static Configuration conf;
private static RPC.Server server;
private final static int SLEEP_DURATION = 1000;
@ProtocolInfo(protocolName = "testProto", protocolVersion = 1)
public interface TestRpcService
extends TestProtobufRpcProto.BlockingInterface {
@ -114,12 +120,23 @@ public EchoResponseProto echo2(RpcController unused, EchoRequestProto request)
return EchoResponseProto.newBuilder().setMessage(request.getMessage())
.build();
}
@Override
public TestProtos.SleepResponseProto sleep(RpcController controller,
TestProtos.SleepRequestProto request) throws ServiceException {
try {
Thread.sleep(request.getMilliSeconds());
} catch (InterruptedException ex) {
// Interruption is ignored; the test only needs an approximate delay.
}
return TestProtos.SleepResponseProto.newBuilder().build();
}
}
@Before
public void setUp() throws IOException { // Setup server for both protocols
conf = new Configuration();
conf.setInt(CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH, 1024);
conf.setBoolean(CommonConfigurationKeys.IPC_SERVER_LOG_SLOW_RPC, true);
// Set RPC engine to protobuf RPC engine
RPC.setProtocolEngine(conf, TestRpcService.class, ProtobufRpcEngine.class);
@ -257,4 +274,62 @@ public void testExtraLongRpc() throws Exception {
// expected
}
}
@Test(timeout = 12000)
public void testLogSlowRPC() throws IOException, ServiceException {
TestRpcService2 client = getClient2();
// make 10,000 fast calls so the sample count exceeds the minimum sample size
for (int x = 0; x < 10000; x++) {
EmptyRequestProto emptyRequest = EmptyRequestProto.newBuilder().build();
client.ping2(null, emptyRequest);
}
// Ensure RPC metrics are updated
RpcMetrics rpcMetrics = server.getRpcMetrics();
assertTrue(rpcMetrics.getProcessingSampleCount() > 999L);
long before = rpcMetrics.getRpcSlowCalls();
// make a really slow call: sleeps for SLEEP_DURATION * 3 = 3000 ms
TestProtos.SleepRequestProto sleepRequest =
TestProtos.SleepRequestProto.newBuilder()
.setMilliSeconds(SLEEP_DURATION * 3).build();
client.sleep(null, sleepRequest);
long after = rpcMetrics.getRpcSlowCalls();
// Ensure slow call is logged.
assertEquals(before + 1L, after);
}
@Test(timeout = 12000)
public void testEnsureNoLogIfDisabled() throws IOException, ServiceException {
// disable slow RPC logging
server.setLogSlowRPC(false);
TestRpcService2 client = getClient2();
// make 10 K fast calls
for (int x = 0; x < 10000; x++) {
EmptyRequestProto emptyRequest = EmptyRequestProto.newBuilder().build();
client.ping2(null, emptyRequest);
}
// Ensure RPC metrics are updated
RpcMetrics rpcMetrics = server.getRpcMetrics();
assertTrue(rpcMetrics.getProcessingSampleCount() > 999L);
long before = rpcMetrics.getRpcSlowCalls();
// make a really slow call. Sleep sleeps for 1000ms
TestProtos.SleepRequestProto sleepRequest =
TestProtos.SleepRequestProto.newBuilder()
.setMilliSeconds(SLEEP_DURATION).build();
client.sleep(null, sleepRequest);
long after = rpcMetrics.getRpcSlowCalls();
// make sure we never called into Log slow RPC routine.
assertEquals(before, after);
}
}

View File

@ -87,7 +87,7 @@ public Object answer(InvocationOnMock invocation) {
* Call getMetrics on source and get a record builder mock to verify
* @param source the metrics source
* @param all if true, return all metrics even if not changed
* @return the record builder mock to verify
*/
public static MetricsRecordBuilder getMetrics(MetricsSource source,
boolean all) {

View File

@ -34,3 +34,10 @@ message EchoRequestProto {
message EchoResponseProto {
required string message = 1;
}
message SleepRequestProto{
required int32 milliSeconds = 1;
}
message SleepResponseProto{
}

View File

@ -37,4 +37,5 @@ service TestProtobufRpcProto {
service TestProtobufRpc2Proto {
rpc ping2(EmptyRequestProto) returns (EmptyResponseProto);
rpc echo2(EchoRequestProto) returns (EchoResponseProto);
rpc sleep(SleepRequestProto) returns (SleepResponseProto);
}