HBASE-25156 TestMasterFailover.testSimpleMasterFailover is flaky (#2507)

Change the test to wait for evidence that the active master has seen that the backup master killed by the test has gone away. This is done before proceeding to validate that the dead backup is correctly omitted from the ClusterStatus report. Also, minor fixup to several assertions, using `assertEquals` instead of `assertTrue(...equals(...))` and correcting expected vs. actual ordering of assertion arguments. Signed-off-by: Michael Stack <stack@apache.org>
2020-10-08 14:23:38 -07:00 · 2020-10-08 14:23:38 -07:00 · b82d8a5517
parent bdcafa895c
commit b82d8a5517
1 changed files with 14 additions and 9 deletions
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
@ -1,4 +1,4 @@
-/**
+/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
@ -20,8 +20,8 @@ package org.apache.hadoop.hbase.master;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
-
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.hbase.ClusterMetrics;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
@ -100,7 +100,7 @@ public class TestMasterFailover {
      // Check that ClusterStatus reports the correct active and backup masters
      assertNotNull(active);
      ClusterMetrics status = active.getClusterMetrics();
-      assertTrue(status.getMasterName().equals(activeName));
+      assertEquals(activeName, status.getMasterName());
      assertEquals(2, status.getBackupMasterNames().size());

      // attempt to stop one of the inactive masters
@ -113,7 +113,7 @@ public class TestMasterFailover {
      // Verify still one active master and it's the same
      for (int i = 0; i < masterThreads.size(); i++) {
        if (masterThreads.get(i).getMaster().isActiveMaster()) {
-          assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
+          assertEquals(activeName, masterThreads.get(i).getMaster().getServerName());
          activeIndex = i;
          active = masterThreads.get(activeIndex).getMaster();
        }
@ -126,10 +126,15 @@ public class TestMasterFailover {
          " regions servers");
      assertEquals(3, rsCount);

+      // wait for the active master to acknowledge loss of the backup from ZK
+      final HMaster activeFinal = active;
+      TEST_UTIL.waitFor(
+        TimeUnit.SECONDS.toMillis(30), () -> activeFinal.getBackupMasters().size() == 1);
+
      // Check that ClusterStatus reports the correct active and backup masters
      assertNotNull(active);
      status = active.getClusterMetrics();
-      assertTrue(status.getMasterName().equals(activeName));
+      assertEquals(activeName, status.getMasterName());
      assertEquals(1, status.getBackupMasterNames().size());

      // kill the active master
@ -148,13 +153,13 @@ public class TestMasterFailover {
      active = masterThreads.get(0).getMaster();
      assertNotNull(active);
      status = active.getClusterMetrics();
-      ServerName mastername = status.getMasterName();
-      assertTrue(mastername.equals(active.getServerName()));
+      ServerName masterName = status.getMasterName();
+      assertNotNull(masterName);
+      assertEquals(active.getServerName(), masterName);
      assertTrue(active.isActiveMaster());
      assertEquals(0, status.getBackupMasterNames().size());
      int rss = status.getLiveServerMetrics().size();
-      LOG.info("Active master " + mastername.getServerName() + " managing " +
-          rss + " region servers");
+      LOG.info("Active master {} managing {} region servers", masterName.getServerName(), rss);
      assertEquals(3, rss);
    } finally {
      // Stop the cluster