HBASE-24075: Fix a race between master shutdown and metrics (re)init

JMXCacheBuster resets the metrics state at various points in time. These resets can race with a master shutdown. While the master is tearing down, metrics initialization can touch a lot of unsafe state, for example invalidated FS objects. To avoid this, this patch makes getMetrics() a no-op when the master is either stopped or in the process of shutting down. Additionally, getClusterId() is made a no-op when the server is stopping or stopped. Writing a test that simulates this race is a bit tricky, but with the patch applied I no longer see the long stack traces from the JIRA locally. Signed-off-by: Michael Stack <stack@apache.org> (cherry picked from commit 6f213e9d5a)
2020-03-31 00:16:15 -07:00 · 2020-03-31 00:16:15 -07:00 · 9384b84552
parent 40caac9b61
commit 9384b84552
6 changed files with 28 additions and 9 deletions
--- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
+++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java
@ -29,6 +29,11 @@ import org.apache.yetus.audience.InterfaceAudience;
@InterfaceAudience.Private
 public interface MetricsMasterWrapper {

+  /**
+   * Returns whether the master is currently running and is not attempting to shut down.
+   */
+  boolean isRunning();
+
  /**
   * Get ServerName
   */
--- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
+++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java
@ -82,7 +82,9 @@ public class MetricsMasterSourceImpl
    MetricsRecordBuilder metricsRecordBuilder = metricsCollector.addRecord(metricsName);

    // masterWrapper can be null because this function is called inside of init.
-    if (masterWrapper != null) {
+    // If the master is already stopped or has initiated a shutdown, no point in registering the
+    // metrics again.
+    if (masterWrapper != null && masterWrapper.isRunning()) {
      metricsRecordBuilder
          .addGauge(Interns.info(MERGE_PLAN_COUNT_NAME, MERGE_PLAN_COUNT_DESC),
              masterWrapper.getMergePlanCount())
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java
@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ClusterId;
+import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@ -46,8 +47,8 @@ public class CachedClusterId {
  public static final Logger LOG = LoggerFactory.getLogger(CachedClusterId.class);
  private static final int MAX_FETCH_TIMEOUT_MS = 10000;

-  private Path rootDir;
-  private FileSystem fs;
+  private final Path rootDir;
+  private final FileSystem fs;

  // When true, indicates that a FileSystem fetch of ClusterID is in progress. This is used to
  // avoid multiple fetches from FS and let only one thread fetch the information.
@ -58,12 +59,15 @@ public class CachedClusterId {
  // Immutable once set and read multiple times.
  private ClusterId clusterId;

+  private final Server server;
+
  // cache stats for testing.
  private AtomicInteger cacheMisses = new AtomicInteger(0);

-  public CachedClusterId(Configuration conf) throws IOException {
-    rootDir = FSUtils.getRootDir(conf);
-    fs = rootDir.getFileSystem(conf);
+  public CachedClusterId(Server server, Configuration conf) throws IOException {
+    this.rootDir = FSUtils.getRootDir(conf);
+    this.fs = rootDir.getFileSystem(conf);
+    this.server = server;
  }

  /**
@ -130,9 +134,12 @@ public class CachedClusterId {
   * trying get from a clean cache.
   *
   * @return ClusterId by reading from FileSystem or null in any error case or cluster ID does
-   *     not exist on the file system.
+   *     not exist on the file system or if the server initiated a tear down.
   */
  public String getFromCacheOrFetch() {
+    if (server.isStopping() || server.isStopped()) {
+      return null;
+    }
    String id = getClusterId();
    if (id != null) {
      return id;
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
@ -574,7 +574,7 @@ public class HMaster extends HRegionServer implements MasterServices {
        this.metaRegionLocationCache = null;
        this.activeMasterManager = null;
      }
-      cachedClusterId = new CachedClusterId(conf);
+      cachedClusterId = new CachedClusterId(this, conf);
    } catch (Throwable t) {
      // Make sure we log the exception. HMaster is often started via reflection and the
      // cause of failed startup is lost.
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java
@ -129,6 +129,10 @@ public class MetricsMasterWrapperImpl implements MetricsMasterWrapper {
    return serverManager.getDeadServers().size();
  }

+  @Override public boolean isRunning() {
+    return !(master.isStopped() || master.isStopping());
+  }
+
  @Override
  public String getServerName() {
    ServerName serverName = master.getServerName();
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java
@ -76,7 +76,8 @@ public class TestCachedClusterId {
  @Test
  public void testMultiThreadedGetClusterId() throws Exception {
    Configuration conf = TEST_UTIL.getConfiguration();
-    CachedClusterId cachedClusterId = new CachedClusterId(conf);
+    CachedClusterId cachedClusterId = new CachedClusterId(TEST_UTIL.getHBaseCluster().getMaster(),
+      conf);
    TestContext context = new TestContext(conf);
    int numThreads = 16;
    for (int i = 0; i < numThreads; i++) {