From 94d576025fd158dae03dd6a6b50a62483b0f04e0 Mon Sep 17 00:00:00 2001 From: Mikhail Antonov Date: Wed, 16 Mar 2016 16:32:11 -0700 Subject: [PATCH] HBASE-15390 Unnecessary MetaCache evictions cause elevated number of requests to meta --- .../hbase/CallQueueTooBigException.java | 4 + .../hadoop/hbase/client/AsyncProcess.java | 2 +- .../hbase/client/ConnectionManager.java | 7 ++ .../apache/hadoop/hbase/client/MetaCache.java | 79 +++++++++---------- .../hbase/client/MetricsConnection.java | 33 +++++++- .../exceptions/ClientExceptionsUtil.java | 31 ++------ 6 files changed, 89 insertions(+), 67 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/CallQueueTooBigException.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/CallQueueTooBigException.java index d07c657a0d8..95ca9885d47 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/CallQueueTooBigException.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/CallQueueTooBigException.java @@ -30,4 +30,8 @@ public class CallQueueTooBigException extends IOException { public CallQueueTooBigException() { super(); } + + public CallQueueTooBigException(String message) { + super(message); + } } \ No newline at end of file diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncProcess.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncProcess.java index 3f7f0c0f1f4..d1fb6d19a65 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncProcess.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/AsyncProcess.java @@ -1164,7 +1164,7 @@ class AsyncProcess { Retry canRetry = errorsByServer.canRetryMore(numAttempt) ? Retry.YES : Retry.NO_RETRIES_EXHAUSTED; - if (tableName == null) { + if (tableName == null && ClientExceptionsUtil.isMetaClearingException(t)) { // tableName is null when we made a cross-table RPC call. connection.clearCaches(server); } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionManager.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionManager.java index dab5392ffba..2d55f6e3cab 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionManager.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionManager.java @@ -2210,6 +2210,9 @@ class ConnectionManager { if (regionName == null) { // we do not know which region, so just remove the cache entry for the row and server + if (metrics != null) { + metrics.incrCacheDroppingExceptions(exception); + } metaCache.clearCache(tableName, rowkey, source); return; } @@ -2249,6 +2252,10 @@ class ConnectionManager { } } + if (metrics != null) { + metrics.incrCacheDroppingExceptions(exception); + } + // If we're here, it means that can cannot be sure about the location, so we remove it from // the cache. Do not send the source because source can be a new server in the same host:port metaCache.clearCache(regionInfo); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetaCache.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetaCache.java index c9f5e029b64..a1ff3d3cee0 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetaCache.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetaCache.java @@ -275,8 +275,13 @@ public class MetaCache { } this.cachedServers.remove(serverName); } - if (deletedSomething && LOG.isTraceEnabled()) { - LOG.trace("Removed all cached region locations that map to " + serverName); + if (deletedSomething) { + if (metrics != null) { + metrics.incrMetaCacheNumClearServer(); + } + if (LOG.isTraceEnabled()) { + LOG.trace("Removed all cached region locations that map to " + serverName); + } } } @@ -290,34 +295,6 @@ public class MetaCache { this.cachedRegionLocations.remove(tableName); } - /** - * Delete a cached location, no matter what it is. Called when we were told to not use cache. - * @param tableName tableName - * @param row - */ - public void clearCache(final TableName tableName, final byte [] row, int replicaId) { - ConcurrentMap tableLocations = getTableLocations(tableName); - - boolean removed = false; - RegionLocations regionLocations = getCachedLocation(tableName, row); - if (regionLocations != null) { - HRegionLocation toBeRemoved = regionLocations.getRegionLocation(replicaId); - RegionLocations updatedLocations = regionLocations.remove(replicaId); - if (updatedLocations != regionLocations) { - byte[] startKey = regionLocations.getRegionLocation().getRegionInfo().getStartKey(); - if (updatedLocations.isEmpty()) { - removed = tableLocations.remove(startKey, regionLocations); - } else { - removed = tableLocations.replace(startKey, regionLocations, updatedLocations); - } - } - - if (removed && LOG.isTraceEnabled() && toBeRemoved != null) { - LOG.trace("Removed " + toBeRemoved + " from cache"); - } - } - } - /** * Delete a cached location, no matter what it is. Called when we were told to not use cache. * @param tableName tableName @@ -330,8 +307,13 @@ public class MetaCache { if (regionLocations != null) { byte[] startKey = regionLocations.getRegionLocation().getRegionInfo().getStartKey(); boolean removed = tableLocations.remove(startKey, regionLocations); - if (removed && LOG.isTraceEnabled()) { - LOG.trace("Removed " + regionLocations + " from cache"); + if (removed) { + if (metrics != null) { + metrics.incrMetaCacheNumClearRegion(); + } + if (LOG.isTraceEnabled()) { + LOG.trace("Removed " + regionLocations + " from cache"); + } } } } @@ -353,9 +335,14 @@ public class MetaCache { } else { removed = tableLocations.replace(startKey, regionLocations, updatedLocations); } - if (removed && LOG.isTraceEnabled()) { - LOG.trace("Removed locations of table: " + tableName + " ,row: " + Bytes.toString(row) - + " mapping to server: " + serverName + " from cache"); + if (removed) { + if (metrics != null) { + metrics.incrMetaCacheNumClearRegion(); + } + if (LOG.isTraceEnabled()) { + LOG.trace("Removed locations of table: " + tableName + " ,row: " + Bytes.toString(row) + + " mapping to server: " + serverName + " from cache"); + } } } } @@ -372,15 +359,20 @@ public class MetaCache { HRegionLocation oldLocation = regionLocations.getRegionLocation(hri.getReplicaId()); if (oldLocation == null) return; RegionLocations updatedLocations = regionLocations.remove(oldLocation); - boolean removed = false; + boolean removed; if (updatedLocations != regionLocations) { if (updatedLocations.isEmpty()) { removed = tableLocations.remove(hri.getStartKey(), regionLocations); } else { removed = tableLocations.replace(hri.getStartKey(), regionLocations, updatedLocations); } - if (removed && LOG.isTraceEnabled()) { - LOG.trace("Removed " + oldLocation + " from cache"); + if (removed) { + if (metrics != null) { + metrics.incrMetaCacheNumClearRegion(); + } + if (LOG.isTraceEnabled()) { + LOG.trace("Removed " + oldLocation + " from cache"); + } } } } @@ -395,15 +387,20 @@ public class MetaCache { RegionLocations regionLocations = tableLocations.get(location.getRegionInfo().getStartKey()); if (regionLocations != null) { RegionLocations updatedLocations = regionLocations.remove(location); - boolean removed = false; + boolean removed; if (updatedLocations != regionLocations) { if (updatedLocations.isEmpty()) { removed = tableLocations.remove(location.getRegionInfo().getStartKey(), regionLocations); } else { removed = tableLocations.replace(location.getRegionInfo().getStartKey(), regionLocations, updatedLocations); } - if (removed && LOG.isTraceEnabled()) { - LOG.trace("Removed " + location + " from cache"); + if (removed) { + if (metrics != null) { + metrics.incrMetaCacheNumClearRegion(); + } + if (LOG.isTraceEnabled()) { + LOG.trace("Removed " + location + " from cache"); + } } } } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java index df279c61029..b6efdb93eb1 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java @@ -60,6 +60,7 @@ public class MetricsConnection { private static final String RESP_BASE = "rpcCallResponseSizeBytes_"; private static final String MEMLOAD_BASE = "memstoreLoad_"; private static final String HEAP_BASE = "heapOccupancy_"; + private static final String CACHE_BASE = "cacheDroppingExceptions_"; private static final String CLIENT_SVC = ClientService.getDescriptor().getName(); /** A container class for collecting details about the RPC call as it percolates. */ @@ -249,6 +250,12 @@ public class MetricsConnection { } }; + private final NewMetric counterFactory = new NewMetric() { + @Override public Counter newMetric(Class clazz, String name, String scope) { + return registry.newCounter(clazz, name, scope); + } + }; + // static metrics @VisibleForTesting protected final Counter metaCacheHits; @@ -261,6 +268,8 @@ public class MetricsConnection { @VisibleForTesting protected final CallTracker putTracker; @VisibleForTesting protected final CallTracker multiTracker; @VisibleForTesting protected final RunnerStats runnerStats; + private final Counter metaCacheNumClearServer; + private final Counter metaCacheNumClearRegion; // dynamic metrics @@ -272,6 +281,8 @@ public class MetricsConnection { @VisibleForTesting protected final ConcurrentMap rpcHistograms = new ConcurrentHashMap<>(CAPACITY * 2 /* tracking both request and response sizes */, LOAD_FACTOR, CONCURRENCY_LEVEL); + private final ConcurrentMap cacheDroppingExceptions = + new ConcurrentHashMap<>(CAPACITY, LOAD_FACTOR, CONCURRENCY_LEVEL); public MetricsConnection(final ConnectionManager.HConnectionImplementation conn) { this.scope = conn.toString(); @@ -299,6 +310,10 @@ public class MetricsConnection { }); this.metaCacheHits = registry.newCounter(this.getClass(), "metaCacheHits", scope); this.metaCacheMisses = registry.newCounter(this.getClass(), "metaCacheMisses", scope); + this.metaCacheNumClearServer = registry.newCounter(this.getClass(), + "metaCacheNumClearServer", scope); + this.metaCacheNumClearRegion = registry.newCounter(this.getClass(), + "metaCacheNumClearRegion", scope); this.getTracker = new CallTracker(this.registry, "Get", scope); this.scanTracker = new CallTracker(this.registry, "Scan", scope); this.appendTracker = new CallTracker(this.registry, "Mutate", "Append", scope); @@ -333,6 +348,16 @@ public class MetricsConnection { metaCacheMisses.inc(); } + /** Increment the number of meta cache drops requested for entire RegionServer. */ + public void incrMetaCacheNumClearServer() { + metaCacheNumClearServer.inc(); + } + + /** Increment the number of meta cache drops requested for individual region. */ + public void incrMetaCacheNumClearRegion() { + metaCacheNumClearRegion.inc(); + } + /** Increment the number of normal runner counts. */ public void incrNormalRunners() { this.runnerStats.incrNormalRunners(); @@ -355,7 +380,8 @@ public class MetricsConnection { T t = map.get(key); if (t == null) { t = factory.newMetric(this.getClass(), key, scope); - map.putIfAbsent(key, t); + T tmp = map.putIfAbsent(key, t); + t = (tmp == null) ? t : tmp; } return t; } @@ -427,4 +453,9 @@ public class MetricsConnection { // Fallback to dynamic registry lookup for DDL methods. updateRpcGeneric(method, stats); } + + public void incrCacheDroppingExceptions(Object exception) { + getMetric(CACHE_BASE + exception.getClass().getSimpleName(), + cacheDroppingExceptions, counterFactory).inc(); + } } diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/ClientExceptionsUtil.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/ClientExceptionsUtil.java index 160951b0b7e..f586dce07cc 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/ClientExceptionsUtil.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/exceptions/ClientExceptionsUtil.java @@ -36,10 +36,8 @@ import org.apache.hadoop.hbase.RegionTooBusyException; import org.apache.hadoop.hbase.RetryImmediatelyException; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.classification.InterfaceStability; -import org.apache.hadoop.hbase.client.RetriesExhaustedException; import org.apache.hadoop.hbase.ipc.CallTimeoutException; import org.apache.hadoop.hbase.ipc.FailedServerException; -import org.apache.hadoop.hbase.ipc.RemoteWithExtrasException; import org.apache.hadoop.hbase.quotas.ThrottlingException; import org.apache.hadoop.ipc.RemoteException; @@ -63,7 +61,7 @@ public final class ClientExceptionsUtil { return (cur instanceof RegionMovedException || cur instanceof RegionOpeningException || cur instanceof RegionTooBusyException || cur instanceof ThrottlingException || cur instanceof MultiActionResultTooLarge || cur instanceof RetryImmediatelyException - || isCallQueueTooBigException(cur) || cur instanceof NotServingRegionException); + || cur instanceof CallQueueTooBigException || cur instanceof NotServingRegionException); } @@ -87,12 +85,8 @@ public final class ClientExceptionsUtil { } if (cur instanceof RemoteException) { RemoteException re = (RemoteException) cur; - cur = re.unwrapRemoteException( - RegionOpeningException.class, RegionMovedException.class, - RegionTooBusyException.class); - if (cur == null) { - cur = re.unwrapRemoteException(); - } + cur = re.unwrapRemoteException(); + // unwrapRemoteException can return the exception given as a parameter when it cannot // unwrap it. In this case, there is no need to look further // noinspection ObjectEquality @@ -110,25 +104,14 @@ public final class ClientExceptionsUtil { } /** - * Checks if the exception is CallQueueTooBig exception, or tries to unwrap - * {@link RemoteWithExtrasException} to see if we've got {@link CallQueueTooBigException}. + * Checks if the exception is CallQueueTooBig exception (maybe wrapped + * into some RemoteException). * @param t exception to check * @return true if it's a CQTBE, false otherwise */ public static boolean isCallQueueTooBigException(Throwable t) { - if (t instanceof CallQueueTooBigException) { - return true; - } - Throwable t2 = t; - if (t instanceof RetriesExhaustedException) { - t2 = t.getCause(); - } - if (t2 instanceof RemoteWithExtrasException) { - return CallQueueTooBigException.class.getName().equals( - ((RemoteWithExtrasException) t2).getClassName().trim()); - } else { - return false; - } + t = findException(t); + return (t instanceof CallQueueTooBigException); } /**