From 50a62497315b522333e0404727923f1bffca1156 Mon Sep 17 00:00:00 2001 From: Xiaolin Ha Date: Wed, 7 Jun 2023 16:31:39 +0800 Subject: [PATCH] HBASE-27897 ConnectionImplementation#locateRegionInMeta should pause and retry when taking user region lock failed (#5258) Signed-off-by: Wellington Chevreuil --- .../hbase/client/ConnectionImplementation.java | 17 +++++++++++------ .../hadoop/hbase/client/TestMetaCache.java | 14 +++++++------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java index b60da28c657..eef82643b5b 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java @@ -993,9 +993,12 @@ public class ConnectionImplementation implements ClusterConnection, Closeable { } // Query the meta region long pauseBase = connectionConfig.getPauseMillis(); - takeUserRegionLock(); - final long lockStartTime = EnvironmentEdgeManager.currentTime(); + long lockStartTime = 0; + boolean lockedUserRegion = false; try { + takeUserRegionLock(); + lockStartTime = EnvironmentEdgeManager.currentTime(); + lockedUserRegion = true; // We don't need to check if useCache is enabled or not. Even if useCache is false // we already cleared the cache for this row before acquiring userRegion lock so if this // row is present in cache that means some other thread has populated it while we were @@ -1104,10 +1107,12 @@ public class ConnectionImplementation implements ClusterConnection, Closeable { ConnectionUtils.getPauseTime(pauseBase, tries), TimeUnit.MILLISECONDS); } } finally { - userRegionLock.unlock(); - // update duration of the lock being held - if (metrics != null) { - metrics.updateUserRegionLockHeld(EnvironmentEdgeManager.currentTime() - lockStartTime); + if (lockedUserRegion) { + userRegionLock.unlock(); + // update duration of the lock being held + if (metrics != null) { + metrics.updateUserRegionLockHeld(EnvironmentEdgeManager.currentTime() - lockStartTime); + } } } try { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java index e79f0dc8e5d..778f6c4c57d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java @@ -592,21 +592,21 @@ public class TestMetaCache { // obtain the client metrics MetricsConnection metrics = conn.getConnectionMetrics(); long queueCount = metrics.getUserRegionLockQueue().getCount(); - assertEquals("Queue of userRegionLock should be updated twice. queueCount: " + queueCount, - queueCount, 2); + assertEquals("Queue of userRegionLock should be updated twice. queueCount: " + queueCount, 2, + queueCount); long timeoutCount = metrics.getUserRegionLockTimeout().getCount(); - assertEquals("Timeout of userRegionLock should happen once. timeoutCount: " + timeoutCount, - timeoutCount, 1); + assertEquals("Timeout of userRegionLock should happen once. timeoutCount: " + timeoutCount, 1, + timeoutCount); long waitingTimerCount = metrics.getUserRegionLockWaitingTimer().getCount(); assertEquals("userRegionLock should be grabbed successfully once. waitingTimerCount: " - + waitingTimerCount, waitingTimerCount, 1); + + waitingTimerCount, 1, waitingTimerCount); long heldTimerCount = metrics.getUserRegionLockHeldTimer().getCount(); assertEquals( - "userRegionLock should be held successfully once. heldTimerCount: " + heldTimerCount, - heldTimerCount, 1); + "userRegionLock should be held successfully once. heldTimerCount: " + heldTimerCount, 1, + heldTimerCount); double heldTime = metrics.getUserRegionLockHeldTimer().getSnapshot().getMax(); assertTrue("Max held time should be greater than 2 seconds. heldTime: " + heldTime, heldTime >= 2E9);