From 0747f0c934a338ac013eca7bc8894c2815cada4d Mon Sep 17 00:00:00 2001 From: Victor Date: Fri, 24 Feb 2023 16:25:01 -0800 Subject: [PATCH] HBASE-15242: add client side metrics for timeout and remote exceptions. (#5023) Signed-off-by: Andrew Purtell --- .../hbase/client/MetricsConnection.java | 19 +++++++- .../hadoop/hbase/ipc/AbstractRpcClient.java | 7 ++- .../hbase/client/TestMetricsConnection.java | 48 +++++++++++++++---- 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java index 302cb283225..0f9b5a5ee86 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java @@ -37,6 +37,7 @@ import java.util.function.Supplier; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.ipc.RemoteException; import org.apache.yetus.audience.InterfaceAudience; import org.apache.hbase.thirdparty.com.google.protobuf.Descriptors.MethodDescriptor; @@ -118,6 +119,9 @@ public final class MetricsConnection implements StatisticTrackable { private static final String CNT_BASE = "rpcCount_"; private static final String FAILURE_CNT_BASE = "rpcFailureCount_"; + private static final String TOTAL_EXCEPTION_CNT = "rpcTotalExceptions"; + private static final String LOCAL_EXCEPTION_CNT_BASE = "rpcLocalExceptions_"; + private static final String REMOTE_EXCEPTION_CNT_BASE = "rpcRemoteExceptions_"; private static final String DRTN_BASE = "rpcCallDurationMs_"; private static final String REQ_BASE = "rpcCallRequestSizeBytes_"; private static final String RESP_BASE = "rpcCallResponseSizeBytes_"; @@ -638,7 +642,7 @@ public final class MetricsConnection implements StatisticTrackable { } /** Report RPC context to metrics system. */ - public void updateRpc(MethodDescriptor method, Message param, CallStats stats, boolean failed) { + public void updateRpc(MethodDescriptor method, Message param, CallStats stats, Throwable e) { int callsPerServer = stats.getConcurrentCallsPerServer(); if (callsPerServer > 0) { concurrentCallsPerServerHist.update(callsPerServer); @@ -646,8 +650,19 @@ public final class MetricsConnection implements StatisticTrackable { // Update the counter that tracks RPCs by type. final String methodName = method.getService().getName() + "_" + method.getName(); getMetric(CNT_BASE + methodName, rpcCounters, counterFactory).inc(); - if (failed) { + if (e != null) { getMetric(FAILURE_CNT_BASE + methodName, rpcCounters, counterFactory).inc(); + getMetric(TOTAL_EXCEPTION_CNT, rpcCounters, counterFactory).inc(); + if (e instanceof RemoteException) { + String fullClassName = ((RemoteException) e).getClassName(); + String simpleClassName = (fullClassName != null) + ? fullClassName.substring(fullClassName.lastIndexOf(".") + 1) + : "unknown"; + getMetric(REMOTE_EXCEPTION_CNT_BASE + simpleClassName, rpcCounters, counterFactory).inc(); + } else { + getMetric(LOCAL_EXCEPTION_CNT_BASE + e.getClass().getSimpleName(), rpcCounters, + counterFactory).inc(); + } } // this implementation is tied directly to protobuf implementation details. would be better // if we could dispatch based on something static, ie, request Message type. diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/AbstractRpcClient.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/AbstractRpcClient.java index 8b56406ed58..23d14c272d2 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/AbstractRpcClient.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/ipc/AbstractRpcClient.java @@ -375,16 +375,15 @@ public abstract class AbstractRpcClient implements RpcC private void onCallFinished(Call call, HBaseRpcController hrc, Address addr, RpcCallback callback) { call.callStats.setCallTimeMs(EnvironmentEdgeManager.currentTime() - call.getStartTime()); - final boolean failed = (call.error != null) ? true : false; if (metrics != null) { - metrics.updateRpc(call.md, call.param, call.callStats, failed); + metrics.updateRpc(call.md, call.param, call.callStats, call.error); } if (LOG.isTraceEnabled()) { LOG.trace("CallId: {}, call: {}, startTime: {}ms, callTime: {}ms, status: {}", call.id, call.md.getName(), call.getStartTime(), call.callStats.getCallTimeMs(), - failed ? "failed" : "successful"); + call.error != null ? "failed" : "successful"); } - if (failed) { + if (call.error != null) { if (call.error instanceof RemoteException) { call.error.fillInStackTrace(); hrc.setFailed(call.error); diff --git a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestMetricsConnection.java b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestMetricsConnection.java index 498cf6c3d5e..66d81599427 100644 --- a/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestMetricsConnection.java +++ b/hbase-client/src/test/java/org/apache/hadoop/hbase/client/TestMetricsConnection.java @@ -33,6 +33,8 @@ import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.ipc.CallTimeoutException; +import org.apache.hadoop.hbase.ipc.RemoteWithExtrasException; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.testclassification.ClientTests; import org.apache.hadoop.hbase.testclassification.MetricsTests; @@ -177,51 +179,77 @@ public class TestMetricsConnection { for (int i = 0; i < loop; i++) { METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Get"), - GetRequest.getDefaultInstance(), MetricsConnection.newCallStats(), false); + GetRequest.getDefaultInstance(), MetricsConnection.newCallStats(), null); METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Scan"), - ScanRequest.getDefaultInstance(), MetricsConnection.newCallStats(), false); + ScanRequest.getDefaultInstance(), MetricsConnection.newCallStats(), + new RemoteWithExtrasException("java.io.IOException", null, false, false)); METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Multi"), - MultiRequest.getDefaultInstance(), MetricsConnection.newCallStats(), true); + MultiRequest.getDefaultInstance(), MetricsConnection.newCallStats(), + new CallTimeoutException("test with CallTimeoutException")); METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Mutate"), MutateRequest.newBuilder() .setMutation(ProtobufUtil.toMutation(MutationType.APPEND, new Append(foo))) .setRegion(region).build(), - MetricsConnection.newCallStats(), false); + MetricsConnection.newCallStats(), null); METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Mutate"), MutateRequest.newBuilder() .setMutation(ProtobufUtil.toMutation(MutationType.DELETE, new Delete(foo))) .setRegion(region).build(), - MetricsConnection.newCallStats(), false); + MetricsConnection.newCallStats(), null); METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Mutate"), MutateRequest.newBuilder() .setMutation(ProtobufUtil.toMutation(MutationType.INCREMENT, new Increment(foo))) .setRegion(region).build(), - MetricsConnection.newCallStats(), false); + MetricsConnection.newCallStats(), null); METRICS.updateRpc(ClientService.getDescriptor().findMethodByName("Mutate"), MutateRequest.newBuilder() .setMutation(ProtobufUtil.toMutation(MutationType.PUT, new Put(foo))).setRegion(region) .build(), - MetricsConnection.newCallStats(), false); + MetricsConnection.newCallStats(), null); } + final String rpcCountPrefix = "rpcCount_" + ClientService.getDescriptor().getName() + "_"; final String rpcFailureCountPrefix = "rpcFailureCount_" + ClientService.getDescriptor().getName() + "_"; String metricKey; long metricVal; Counter counter; - for (String method : new String[] { "Get", "Scan", "Mutate" }) { + + for (String method : new String[] { "Get", "Scan", "Multi", "Mutate" }) { metricKey = rpcCountPrefix + method; metricVal = METRICS.getRpcCounters().get(metricKey).getCount(); assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal >= loop); + metricKey = rpcFailureCountPrefix + method; counter = METRICS.getRpcCounters().get(metricKey); metricVal = (counter != null) ? counter.getCount() : 0; - assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal == 0); + if (method.equals("Get") || method.equals("Mutate")) { + // no failure + assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal == 0); + } else { + // has failure + assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal == loop); + } } - metricKey = rpcFailureCountPrefix + "Multi"; + + // remote exception + metricKey = "rpcRemoteExceptions_IOException"; counter = METRICS.getRpcCounters().get(metricKey); metricVal = (counter != null) ? counter.getCount() : 0; assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal == loop); + + // local exception + metricKey = "rpcLocalExceptions_CallTimeoutException"; + counter = METRICS.getRpcCounters().get(metricKey); + metricVal = (counter != null) ? counter.getCount() : 0; + assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal == loop); + + // total exception + metricKey = "rpcTotalExceptions"; + counter = METRICS.getRpcCounters().get(metricKey); + metricVal = (counter != null) ? counter.getCount() : 0; + assertTrue("metric: " + metricKey + " val: " + metricVal, metricVal == loop * 2); + for (MetricsConnection.CallTracker t : new MetricsConnection.CallTracker[] { METRICS.getGetTracker(), METRICS.getScanTracker(), METRICS.getMultiTracker(), METRICS.getAppendTracker(), METRICS.getDeleteTracker(), METRICS.getIncrementTracker(),