From bcef28eefaf192b0ad48c8011f98b8e944340da5 Mon Sep 17 00:00:00 2001 From: Elliott Clark Date: Thu, 20 Aug 2015 13:32:39 -0700 Subject: [PATCH] HBASE-14274 Deadlock in region metrics on shutdown: MetricsRegionSourceImpl vs MetricsRegionAggregateSourceImpl Signed-off-by: stack --- .../MetricsRegionAggregateSourceImpl.java | 53 +++++------------- .../regionserver/MetricsRegionSourceImpl.java | 56 ++++++++++--------- 2 files changed, 44 insertions(+), 65 deletions(-) diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionAggregateSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionAggregateSourceImpl.java index fdb3e83c416..009fa9c6705 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionAggregateSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionAggregateSourceImpl.java @@ -18,10 +18,10 @@ package org.apache.hadoop.hbase.regionserver; -import java.util.HashSet; +import java.util.Collections; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.metrics.BaseSourceImpl; @@ -34,12 +34,11 @@ import org.apache.hadoop.metrics2.lib.MetricsExecutorImpl; @InterfaceAudience.Private public class MetricsRegionAggregateSourceImpl extends BaseSourceImpl implements MetricsRegionAggregateSource { - // lock to guard against concurrent access to regionSources - private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); + private final MetricsExecutorImpl executor = new MetricsExecutorImpl(); - private final HashSet regionSources = - new HashSet(); + private final Set regionSources = + Collections.newSetFromMap(new ConcurrentHashMap()); public MetricsRegionAggregateSourceImpl() { this(METRICS_NAME, METRICS_DESCRIPTION, METRICS_CONTEXT, METRICS_JMX_CONTEXT); @@ -62,36 +61,18 @@ public class MetricsRegionAggregateSourceImpl extends BaseSourceImpl @Override public void register(MetricsRegionSource source) { - Lock l = lock.writeLock(); - l.lock(); - try { - regionSources.add(source); - clearCache(); - } finally { - l.unlock(); - } + regionSources.add(source); + clearCache(); } @Override public void deregister(MetricsRegionSource toRemove) { - Lock l = lock.writeLock(); - l.lock(); - try { - regionSources.remove(toRemove); - clearCache(); - } finally { - l.unlock(); - } + regionSources.remove(toRemove); + clearCache(); } private synchronized void clearCache() { JmxCacheBuster.clearJmxCache(true); - executor.getExecutor().schedule(new Runnable() { - @Override - public void run() { - JmxCacheBuster.clearJmxCache(); - } - }, 5, TimeUnit.MINUTES); } /** @@ -107,18 +88,12 @@ public class MetricsRegionAggregateSourceImpl extends BaseSourceImpl MetricsRecordBuilder mrb = collector.addRecord(metricsName); if (regionSources != null) { - Lock l = lock.readLock(); - l.lock(); - try { - for (MetricsRegionSource regionMetricSource : regionSources) { - if (regionMetricSource instanceof MetricsRegionSourceImpl) { - ((MetricsRegionSourceImpl) regionMetricSource).snapshot(mrb, all); - } + for (MetricsRegionSource regionMetricSource : regionSources) { + if (regionMetricSource instanceof MetricsRegionSourceImpl) { + ((MetricsRegionSourceImpl) regionMetricSource).snapshot(mrb, all); } - mrb.addGauge(Interns.info(NUM_REGIONS, NUMBER_OF_REGIONS_DESC), regionSources.size()); - } finally { - l.unlock(); } + mrb.addGauge(Interns.info(NUM_REGIONS, NUMBER_OF_REGIONS_DESC), regionSources.size()); metricsRegistry.snapshot(mrb, all); } } diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java index 7290c56f637..5b3e6c4f0f0 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/regionserver/MetricsRegionSourceImpl.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.regionserver; import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -39,12 +40,7 @@ public class MetricsRegionSourceImpl implements MetricsRegionSource { private static final Log LOG = LogFactory.getLog(MetricsRegionSourceImpl.class); - private boolean closed = false; - - // lock to ensure that lock and pushing metrics can't race. - // When changing or acting on the closed boolean this lock must be held. - // The write lock must be held when changing closed. - private final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(false); + private AtomicBoolean closed = new AtomicBoolean(false); // Non-final so that we can null out the wrapper // This is just paranoia. We really really don't want to @@ -109,16 +105,20 @@ public class MetricsRegionSourceImpl implements MetricsRegionSource { @Override public void close() { - Lock lock = readWriteLock.writeLock(); - lock.lock(); - try { - if (closed) { - return; - } + boolean wasClosed = closed.getAndSet(false); - closed = true; - agg.deregister(this); + // Has someone else already closed this for us? + if (wasClosed) { + return; + } + // Before removing the metrics remove this region from the aggregate region bean. + // This should mean that it's unlikely that snapshot and close happen at the same time. + agg.deregister(this); + + // While it's un-likely that snapshot and close happen at the same time it's still possible. + // So grab the lock to ensure that all calls to snapshot are done before we remove the metrics + synchronized (this) { if (LOG.isTraceEnabled()) { LOG.trace("Removing region Metrics: " + regionWrapper.getRegionName()); } @@ -133,10 +133,6 @@ public class MetricsRegionSourceImpl implements MetricsRegionSource { registry.removeHistogramMetrics(regionScanNextKey); regionWrapper = null; - - JmxCacheBuster.clearJmxCache(); - } finally { - lock.unlock(); } } @@ -186,13 +182,23 @@ public class MetricsRegionSourceImpl implements MetricsRegionSource { } void snapshot(MetricsRecordBuilder mrb, boolean ignored) { - Lock lock = readWriteLock.readLock(); - // Grab the read lock. - // This ensures that - lock.lock(); - try { - if (closed) { + // If there is a close that started be double extra sure + // that we're not getting any locks and not putting data + // into the metrics that should be removed. So early out + // before even getting the lock. + if (closed.get()) { + return; + } + + // Grab the read + // This ensures that removes of the metrics + // can't happen while we are putting them back in. + synchronized (this) { + + // It's possible that a close happened between checking + // the closed variable and getting the lock. + if (closed.get()) { return; } @@ -254,8 +260,6 @@ public class MetricsRegionSourceImpl implements MetricsRegionSource { MetricsRegionSource.COPROCESSOR_EXECUTION_STATISTICS_DESC + "99th percentile: "), ds .getPercentile(99d) / 1000); } - } finally { - lock.unlock(); } }