HADOOP-16290. Enable RpcMetrics units to be configurable (#3198)
Signed-off-by: Akira Ajisaka <aajisaka@apache.org>
This commit is contained in:
parent
17bf2fcbc5
commit
e1d00addb5
|
@ -381,7 +381,9 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic {
|
||||||
public static final boolean RPC_METRICS_QUANTILE_ENABLE_DEFAULT = false;
|
public static final boolean RPC_METRICS_QUANTILE_ENABLE_DEFAULT = false;
|
||||||
public static final String RPC_METRICS_PERCENTILES_INTERVALS_KEY =
|
public static final String RPC_METRICS_PERCENTILES_INTERVALS_KEY =
|
||||||
"rpc.metrics.percentiles.intervals";
|
"rpc.metrics.percentiles.intervals";
|
||||||
|
|
||||||
|
public static final String RPC_METRICS_TIME_UNIT = "rpc.metrics.timeunit";
|
||||||
|
|
||||||
/** Allowed hosts for nfs exports */
|
/** Allowed hosts for nfs exports */
|
||||||
public static final String NFS_EXPORTS_ALLOWED_HOSTS_SEPARATOR = ";";
|
public static final String NFS_EXPORTS_ALLOWED_HOSTS_SEPARATOR = ";";
|
||||||
public static final String NFS_EXPORTS_ALLOWED_HOSTS_KEY = "nfs.exports.allowed.hosts";
|
public static final String NFS_EXPORTS_ALLOWED_HOSTS_KEY = "nfs.exports.allowed.hosts";
|
||||||
|
|
|
@ -193,6 +193,7 @@ public class DecayRpcScheduler implements RpcScheduler,
|
||||||
private final String namespace;
|
private final String namespace;
|
||||||
private final int topUsersCount; // e.g., report top 10 users' metrics
|
private final int topUsersCount; // e.g., report top 10 users' metrics
|
||||||
private static final double PRECISION = 0.0001;
|
private static final double PRECISION = 0.0001;
|
||||||
|
private final TimeUnit metricsTimeUnit;
|
||||||
private MetricsProxy metricsProxy;
|
private MetricsProxy metricsProxy;
|
||||||
private final CostProvider costProvider;
|
private final CostProvider costProvider;
|
||||||
private final Map<String, Integer> staticPriorities = new HashMap<>();
|
private final Map<String, Integer> staticPriorities = new HashMap<>();
|
||||||
|
@ -266,6 +267,8 @@ public class DecayRpcScheduler implements RpcScheduler,
|
||||||
DecayRpcSchedulerDetailedMetrics.create(ns);
|
DecayRpcSchedulerDetailedMetrics.create(ns);
|
||||||
decayRpcSchedulerDetailedMetrics.init(numLevels);
|
decayRpcSchedulerDetailedMetrics.init(numLevels);
|
||||||
|
|
||||||
|
metricsTimeUnit = RpcMetrics.getMetricsTimeUnit(conf);
|
||||||
|
|
||||||
// Setup delay timer
|
// Setup delay timer
|
||||||
Timer timer = new Timer(true);
|
Timer timer = new Timer(true);
|
||||||
DecayTask task = new DecayTask(this, timer);
|
DecayTask task = new DecayTask(this, timer);
|
||||||
|
@ -725,8 +728,9 @@ public class DecayRpcScheduler implements RpcScheduler,
|
||||||
addCost(user, processingCost);
|
addCost(user, processingCost);
|
||||||
|
|
||||||
int priorityLevel = schedulable.getPriorityLevel();
|
int priorityLevel = schedulable.getPriorityLevel();
|
||||||
long queueTime = details.get(Timing.QUEUE, RpcMetrics.TIMEUNIT);
|
long queueTime = details.get(Timing.QUEUE, metricsTimeUnit);
|
||||||
long processingTime = details.get(Timing.PROCESSING, RpcMetrics.TIMEUNIT);
|
long processingTime = details.get(Timing.PROCESSING,
|
||||||
|
metricsTimeUnit);
|
||||||
|
|
||||||
this.decayRpcSchedulerDetailedMetrics.addQueueTime(
|
this.decayRpcSchedulerDetailedMetrics.addQueueTime(
|
||||||
priorityLevel, queueTime);
|
priorityLevel, queueTime);
|
||||||
|
|
|
@ -62,10 +62,10 @@ public interface RpcScheduler {
|
||||||
// this interface, a default implementation is supplied which uses the old
|
// this interface, a default implementation is supplied which uses the old
|
||||||
// method. All new implementations MUST override this interface and should
|
// method. All new implementations MUST override this interface and should
|
||||||
// NOT use the other addResponseTime method.
|
// NOT use the other addResponseTime method.
|
||||||
int queueTime = (int)
|
int queueTime = (int) details.get(ProcessingDetails.Timing.QUEUE,
|
||||||
details.get(ProcessingDetails.Timing.QUEUE, RpcMetrics.TIMEUNIT);
|
RpcMetrics.DEFAULT_METRIC_TIME_UNIT);
|
||||||
int processingTime = (int)
|
int processingTime = (int) details.get(ProcessingDetails.Timing.PROCESSING,
|
||||||
details.get(ProcessingDetails.Timing.PROCESSING, RpcMetrics.TIMEUNIT);
|
RpcMetrics.DEFAULT_METRIC_TIME_UNIT);
|
||||||
addResponseTime(callName, schedulable.getPriorityLevel(),
|
addResponseTime(callName, schedulable.getPriorityLevel(),
|
||||||
queueTime, processingTime);
|
queueTime, processingTime);
|
||||||
}
|
}
|
||||||
|
|
|
@ -544,13 +544,13 @@ public abstract class Server {
|
||||||
(rpcMetrics.getProcessingStdDev() * deviation);
|
(rpcMetrics.getProcessingStdDev() * deviation);
|
||||||
|
|
||||||
long processingTime =
|
long processingTime =
|
||||||
details.get(Timing.PROCESSING, RpcMetrics.TIMEUNIT);
|
details.get(Timing.PROCESSING, rpcMetrics.getMetricsTimeUnit());
|
||||||
if ((rpcMetrics.getProcessingSampleCount() > minSampleSize) &&
|
if ((rpcMetrics.getProcessingSampleCount() > minSampleSize) &&
|
||||||
(processingTime > threeSigma)) {
|
(processingTime > threeSigma)) {
|
||||||
LOG.warn(
|
LOG.warn(
|
||||||
"Slow RPC : {} took {} {} to process from client {},"
|
"Slow RPC : {} took {} {} to process from client {},"
|
||||||
+ " the processing detail is {}",
|
+ " the processing detail is {}",
|
||||||
methodName, processingTime, RpcMetrics.TIMEUNIT, call,
|
methodName, processingTime, rpcMetrics.getMetricsTimeUnit(), call,
|
||||||
details.toString());
|
details.toString());
|
||||||
rpcMetrics.incrSlowRpc();
|
rpcMetrics.incrSlowRpc();
|
||||||
}
|
}
|
||||||
|
@ -570,7 +570,7 @@ public abstract class Server {
|
||||||
deltaNanos -= details.get(Timing.RESPONSE);
|
deltaNanos -= details.get(Timing.RESPONSE);
|
||||||
details.set(Timing.HANDLER, deltaNanos);
|
details.set(Timing.HANDLER, deltaNanos);
|
||||||
|
|
||||||
long queueTime = details.get(Timing.QUEUE, RpcMetrics.TIMEUNIT);
|
long queueTime = details.get(Timing.QUEUE, rpcMetrics.getMetricsTimeUnit());
|
||||||
rpcMetrics.addRpcQueueTime(queueTime);
|
rpcMetrics.addRpcQueueTime(queueTime);
|
||||||
|
|
||||||
if (call.isResponseDeferred() || connDropped) {
|
if (call.isResponseDeferred() || connDropped) {
|
||||||
|
@ -579,9 +579,9 @@ public abstract class Server {
|
||||||
}
|
}
|
||||||
|
|
||||||
long processingTime =
|
long processingTime =
|
||||||
details.get(Timing.PROCESSING, RpcMetrics.TIMEUNIT);
|
details.get(Timing.PROCESSING, rpcMetrics.getMetricsTimeUnit());
|
||||||
long waitTime =
|
long waitTime =
|
||||||
details.get(Timing.LOCKWAIT, RpcMetrics.TIMEUNIT);
|
details.get(Timing.LOCKWAIT, rpcMetrics.getMetricsTimeUnit());
|
||||||
rpcMetrics.addRpcLockWaitTime(waitTime);
|
rpcMetrics.addRpcLockWaitTime(waitTime);
|
||||||
rpcMetrics.addRpcProcessingTime(processingTime);
|
rpcMetrics.addRpcProcessingTime(processingTime);
|
||||||
// don't include lock wait for detailed metrics.
|
// don't include lock wait for detailed metrics.
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.hadoop.ipc.metrics;
|
||||||
|
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
|
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.hadoop.fs.CommonConfigurationKeys;
|
import org.apache.hadoop.fs.CommonConfigurationKeys;
|
||||||
import org.apache.hadoop.ipc.Server;
|
import org.apache.hadoop.ipc.Server;
|
||||||
|
@ -48,9 +49,12 @@ public class RpcMetrics {
|
||||||
final MetricsRegistry registry;
|
final MetricsRegistry registry;
|
||||||
final String name;
|
final String name;
|
||||||
final boolean rpcQuantileEnable;
|
final boolean rpcQuantileEnable;
|
||||||
|
|
||||||
|
public static final TimeUnit DEFAULT_METRIC_TIME_UNIT =
|
||||||
|
TimeUnit.MILLISECONDS;
|
||||||
/** The time unit used when storing/accessing time durations. */
|
/** The time unit used when storing/accessing time durations. */
|
||||||
public final static TimeUnit TIMEUNIT = TimeUnit.MILLISECONDS;
|
private final TimeUnit metricsTimeUnit;
|
||||||
|
|
||||||
RpcMetrics(Server server, Configuration conf) {
|
RpcMetrics(Server server, Configuration conf) {
|
||||||
String port = String.valueOf(server.getListenerAddress().getPort());
|
String port = String.valueOf(server.getListenerAddress().getPort());
|
||||||
name = "RpcActivityForPort" + port;
|
name = "RpcActivityForPort" + port;
|
||||||
|
@ -63,6 +67,7 @@ public class RpcMetrics {
|
||||||
rpcQuantileEnable = (intervals.length > 0) && conf.getBoolean(
|
rpcQuantileEnable = (intervals.length > 0) && conf.getBoolean(
|
||||||
CommonConfigurationKeys.RPC_METRICS_QUANTILE_ENABLE,
|
CommonConfigurationKeys.RPC_METRICS_QUANTILE_ENABLE,
|
||||||
CommonConfigurationKeys.RPC_METRICS_QUANTILE_ENABLE_DEFAULT);
|
CommonConfigurationKeys.RPC_METRICS_QUANTILE_ENABLE_DEFAULT);
|
||||||
|
metricsTimeUnit = getMetricsTimeUnit(conf);
|
||||||
if (rpcQuantileEnable) {
|
if (rpcQuantileEnable) {
|
||||||
rpcQueueTimeQuantiles =
|
rpcQueueTimeQuantiles =
|
||||||
new MutableQuantiles[intervals.length];
|
new MutableQuantiles[intervals.length];
|
||||||
|
@ -75,19 +80,19 @@ public class RpcMetrics {
|
||||||
for (int i = 0; i < intervals.length; i++) {
|
for (int i = 0; i < intervals.length; i++) {
|
||||||
int interval = intervals[i];
|
int interval = intervals[i];
|
||||||
rpcQueueTimeQuantiles[i] = registry.newQuantiles("rpcQueueTime"
|
rpcQueueTimeQuantiles[i] = registry.newQuantiles("rpcQueueTime"
|
||||||
+ interval + "s", "rpc queue time in " + TIMEUNIT, "ops",
|
+ interval + "s", "rpc queue time in " + metricsTimeUnit, "ops",
|
||||||
"latency", interval);
|
"latency", interval);
|
||||||
rpcLockWaitTimeQuantiles[i] = registry.newQuantiles(
|
rpcLockWaitTimeQuantiles[i] = registry.newQuantiles(
|
||||||
"rpcLockWaitTime" + interval + "s",
|
"rpcLockWaitTime" + interval + "s",
|
||||||
"rpc lock wait time in " + TIMEUNIT, "ops",
|
"rpc lock wait time in " + metricsTimeUnit, "ops",
|
||||||
"latency", interval);
|
"latency", interval);
|
||||||
rpcProcessingTimeQuantiles[i] = registry.newQuantiles(
|
rpcProcessingTimeQuantiles[i] = registry.newQuantiles(
|
||||||
"rpcProcessingTime" + interval + "s",
|
"rpcProcessingTime" + interval + "s",
|
||||||
"rpc processing time in " + TIMEUNIT, "ops",
|
"rpc processing time in " + metricsTimeUnit, "ops",
|
||||||
"latency", interval);
|
"latency", interval);
|
||||||
deferredRpcProcessingTimeQuantiles[i] = registry.newQuantiles(
|
deferredRpcProcessingTimeQuantiles[i] = registry.newQuantiles(
|
||||||
"deferredRpcProcessingTime" + interval + "s",
|
"deferredRpcProcessingTime" + interval + "s",
|
||||||
"deferred rpc processing time in " + TIMEUNIT, "ops",
|
"deferred rpc processing time in " + metricsTimeUnit, "ops",
|
||||||
"latency", interval);
|
"latency", interval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -141,6 +146,27 @@ public class RpcMetrics {
|
||||||
return server.getNumDroppedConnections();
|
return server.getNumDroppedConnections();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TimeUnit getMetricsTimeUnit() {
|
||||||
|
return metricsTimeUnit;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static TimeUnit getMetricsTimeUnit(Configuration conf) {
|
||||||
|
TimeUnit metricsTimeUnit = RpcMetrics.DEFAULT_METRIC_TIME_UNIT;
|
||||||
|
String timeunit = conf.get(CommonConfigurationKeys.RPC_METRICS_TIME_UNIT);
|
||||||
|
if (StringUtils.isNotEmpty(timeunit)) {
|
||||||
|
try {
|
||||||
|
metricsTimeUnit = TimeUnit.valueOf(timeunit);
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
LOG.info("Config key {} 's value {} does not correspond to enum values"
|
||||||
|
+ " of java.util.concurrent.TimeUnit. Hence default unit"
|
||||||
|
+ " {} will be used",
|
||||||
|
CommonConfigurationKeys.RPC_METRICS_TIME_UNIT, timeunit,
|
||||||
|
RpcMetrics.DEFAULT_METRIC_TIME_UNIT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return metricsTimeUnit;
|
||||||
|
}
|
||||||
|
|
||||||
// Public instrumentation methods that could be extracted to an
|
// Public instrumentation methods that could be extracted to an
|
||||||
// abstract class if we decide to do custom instrumentation classes a la
|
// abstract class if we decide to do custom instrumentation classes a la
|
||||||
// JobTrackerInstrumentation. The methods with //@Override comment are
|
// JobTrackerInstrumentation. The methods with //@Override comment are
|
||||||
|
|
|
@ -3345,6 +3345,21 @@
|
||||||
</description>
|
</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>rpc.metrics.timeunit</name>
|
||||||
|
<value>MILLISECONDS</value>
|
||||||
|
<description>
|
||||||
|
This property is used to configure timeunit for various RPC Metrics
|
||||||
|
e.g rpcQueueTime, rpcLockWaitTime, rpcProcessingTime,
|
||||||
|
deferredRpcProcessingTime. In the absence of this property,
|
||||||
|
default timeunit used is milliseconds.
|
||||||
|
The value of this property should match to any one value of enum:
|
||||||
|
java.util.concurrent.TimeUnit.
|
||||||
|
Some of the valid values: NANOSECONDS, MICROSECONDS, MILLISECONDS,
|
||||||
|
SECONDS etc.
|
||||||
|
</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>rpc.metrics.percentiles.intervals</name>
|
<name>rpc.metrics.percentiles.intervals</name>
|
||||||
<value></value>
|
<value></value>
|
||||||
|
|
|
@ -65,6 +65,8 @@ rpc
|
||||||
---
|
---
|
||||||
|
|
||||||
Each metrics record contains tags such as Hostname and port (number to which server is bound) as additional information along with metrics.
|
Each metrics record contains tags such as Hostname and port (number to which server is bound) as additional information along with metrics.
|
||||||
|
`rpc.metrics.timeunit` config can be used to configure timeunit for RPC metrics.
|
||||||
|
The default timeunit used for RPC metrics is milliseconds (as per the below description).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|:---- |:---- |
|
|:---- |:---- |
|
||||||
|
|
|
@ -29,7 +29,6 @@ import org.apache.hadoop.io.retry.RetryProxy;
|
||||||
import org.apache.hadoop.ipc.Client.ConnectionId;
|
import org.apache.hadoop.ipc.Client.ConnectionId;
|
||||||
import org.apache.hadoop.ipc.Server.Call;
|
import org.apache.hadoop.ipc.Server.Call;
|
||||||
import org.apache.hadoop.ipc.Server.Connection;
|
import org.apache.hadoop.ipc.Server.Connection;
|
||||||
import org.apache.hadoop.ipc.metrics.RpcMetrics;
|
|
||||||
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcErrorCodeProto;
|
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcErrorCodeProto;
|
||||||
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcStatusProto;
|
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto.RpcStatusProto;
|
||||||
import org.apache.hadoop.ipc.protobuf.TestProtos;
|
import org.apache.hadoop.ipc.protobuf.TestProtos;
|
||||||
|
@ -1098,8 +1097,8 @@ public class TestRPC extends TestRpcBase {
|
||||||
proxy.lockAndSleep(null, newSleepRequest(5));
|
proxy.lockAndSleep(null, newSleepRequest(5));
|
||||||
rpcMetrics = getMetrics(server.getRpcMetrics().name());
|
rpcMetrics = getMetrics(server.getRpcMetrics().name());
|
||||||
assertGauge("RpcLockWaitTimeAvgTime",
|
assertGauge("RpcLockWaitTimeAvgTime",
|
||||||
(double)(RpcMetrics.TIMEUNIT.convert(10L, TimeUnit.SECONDS)),
|
(double)(server.getRpcMetrics().getMetricsTimeUnit().convert(10L,
|
||||||
rpcMetrics);
|
TimeUnit.SECONDS)), rpcMetrics);
|
||||||
} finally {
|
} finally {
|
||||||
if (proxy2 != null) {
|
if (proxy2 != null) {
|
||||||
RPC.stopProxy(proxy2);
|
RPC.stopProxy(proxy2);
|
||||||
|
@ -1603,6 +1602,70 @@ public class TestRPC extends TestRpcBase {
|
||||||
assertTrue(rpcEngine instanceof StoppedRpcEngine);
|
assertTrue(rpcEngine instanceof StoppedRpcEngine);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testRpcMetricsInNanos() throws Exception {
|
||||||
|
final Server server;
|
||||||
|
TestRpcService proxy = null;
|
||||||
|
|
||||||
|
final int interval = 1;
|
||||||
|
conf.setBoolean(CommonConfigurationKeys.
|
||||||
|
RPC_METRICS_QUANTILE_ENABLE, true);
|
||||||
|
conf.set(CommonConfigurationKeys.
|
||||||
|
RPC_METRICS_PERCENTILES_INTERVALS_KEY, "" + interval);
|
||||||
|
conf.set(CommonConfigurationKeys.RPC_METRICS_TIME_UNIT, "NANOSECONDS");
|
||||||
|
|
||||||
|
server = setupTestServer(conf, 5);
|
||||||
|
String testUser = "testUserInNanos";
|
||||||
|
UserGroupInformation anotherUser =
|
||||||
|
UserGroupInformation.createRemoteUser(testUser);
|
||||||
|
TestRpcService proxy2 =
|
||||||
|
anotherUser.doAs((PrivilegedAction<TestRpcService>) () -> {
|
||||||
|
try {
|
||||||
|
return RPC.getProxy(TestRpcService.class, 0,
|
||||||
|
server.getListenerAddress(), conf);
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.error("Something went wrong.", e);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
try {
|
||||||
|
proxy = getClient(addr, conf);
|
||||||
|
for (int i = 0; i < 100; i++) {
|
||||||
|
proxy.ping(null, newEmptyRequest());
|
||||||
|
proxy.echo(null, newEchoRequest("" + i));
|
||||||
|
proxy2.echo(null, newEchoRequest("" + i));
|
||||||
|
}
|
||||||
|
MetricsRecordBuilder rpcMetrics =
|
||||||
|
getMetrics(server.getRpcMetrics().name());
|
||||||
|
assertEquals("Expected zero rpc lock wait time",
|
||||||
|
0, getDoubleGauge("RpcLockWaitTimeAvgTime", rpcMetrics), 0.001);
|
||||||
|
MetricsAsserts.assertQuantileGauges("RpcQueueTime" + interval + "s",
|
||||||
|
rpcMetrics);
|
||||||
|
MetricsAsserts.assertQuantileGauges("RpcProcessingTime" + interval + "s",
|
||||||
|
rpcMetrics);
|
||||||
|
|
||||||
|
proxy.lockAndSleep(null, newSleepRequest(5));
|
||||||
|
rpcMetrics = getMetrics(server.getRpcMetrics().name());
|
||||||
|
assertGauge("RpcLockWaitTimeAvgTime",
|
||||||
|
(double)(server.getRpcMetrics().getMetricsTimeUnit().convert(10L,
|
||||||
|
TimeUnit.SECONDS)), rpcMetrics);
|
||||||
|
LOG.info("RpcProcessingTimeAvgTime: {} , RpcQueueTimeAvgTime: {}",
|
||||||
|
getDoubleGauge("RpcProcessingTimeAvgTime", rpcMetrics),
|
||||||
|
getDoubleGauge("RpcQueueTimeAvgTime", rpcMetrics));
|
||||||
|
|
||||||
|
assertTrue(getDoubleGauge("RpcProcessingTimeAvgTime", rpcMetrics)
|
||||||
|
> 4000000D);
|
||||||
|
assertTrue(getDoubleGauge("RpcQueueTimeAvgTime", rpcMetrics)
|
||||||
|
> 4000D);
|
||||||
|
} finally {
|
||||||
|
if (proxy2 != null) {
|
||||||
|
RPC.stopProxy(proxy2);
|
||||||
|
}
|
||||||
|
stop(server, proxy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
new TestRPC().testCallsInternal(conf);
|
new TestRPC().testCallsInternal(conf);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue