HBASE-25774 TestSyncReplicationStandbyKillRS#testStandbyKillRegionServer is flaky (#3189)

Wait for the restarter thread to finish before checking the peer's sync replication state.
Add more detailed logs.

Signed-off-by: meiyi <myimeiyi@gmail.com>
This commit is contained in:
Duo Zhang 2021-04-22 10:10:15 +08:00 committed by GitHub
parent d5c5e48839
commit 50920ee306
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 18 additions and 8 deletions

View File

@ -68,12 +68,14 @@ public class TestSyncReplicationStandbyKillRS extends SyncReplicationTestBase {
UTIL1.shutdownMiniCluster(); UTIL1.shutdownMiniCluster();
JVMClusterUtil.MasterThread activeMaster = UTIL2.getMiniHBaseCluster().getMasterThread(); JVMClusterUtil.MasterThread activeMaster = UTIL2.getMiniHBaseCluster().getMasterThread();
String threadName = "RegionServer-Restarter";
Thread t = new Thread(() -> { Thread t = new Thread(() -> {
try { try {
List<JVMClusterUtil.RegionServerThread> regionServers = List<JVMClusterUtil.RegionServerThread> regionServers =
UTIL2.getMiniHBaseCluster().getLiveRegionServerThreads(); UTIL2.getMiniHBaseCluster().getLiveRegionServerThreads();
for (JVMClusterUtil.RegionServerThread rst : regionServers) { for (JVMClusterUtil.RegionServerThread rst : regionServers) {
ServerName serverName = rst.getRegionServer().getServerName(); ServerName serverName = rst.getRegionServer().getServerName();
LOG.debug("Going to stop [{}]", serverName);
rst.getRegionServer().stop("Stop RS for test"); rst.getRegionServer().stop("Stop RS for test");
waitForRSShutdownToStartAndFinish(activeMaster, serverName); waitForRSShutdownToStartAndFinish(activeMaster, serverName);
JVMClusterUtil.RegionServerThread restarted = JVMClusterUtil.RegionServerThread restarted =
@ -83,9 +85,11 @@ public class TestSyncReplicationStandbyKillRS extends SyncReplicationTestBase {
} catch (Exception e) { } catch (Exception e) {
LOG.error("Failed to kill RS", e); LOG.error("Failed to kill RS", e);
} }
}); }, threadName);
t.start(); t.start();
LOG.debug("Going to transit peer {} to {} state", PEER_ID,
SyncReplicationState.DOWNGRADE_ACTIVE);
// Transit standby to DA to replay logs // Transit standby to DA to replay logs
try { try {
UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID,
@ -94,11 +98,18 @@ public class TestSyncReplicationStandbyKillRS extends SyncReplicationTestBase {
LOG.error("Failed to transit standby cluster to " + SyncReplicationState.DOWNGRADE_ACTIVE, e); LOG.error("Failed to transit standby cluster to " + SyncReplicationState.DOWNGRADE_ACTIVE, e);
} }
while (UTIL2.getAdmin().getReplicationPeerSyncReplicationState(PEER_ID) LOG.debug("Waiting for the restarter thread {} to quit", threadName);
!= SyncReplicationState.DOWNGRADE_ACTIVE) { t.join();
while (UTIL2.getAdmin()
.getReplicationPeerSyncReplicationState(PEER_ID) != SyncReplicationState.DOWNGRADE_ACTIVE) {
LOG.debug("Waiting for peer {} to be in {} state", PEER_ID,
SyncReplicationState.DOWNGRADE_ACTIVE);
Thread.sleep(SLEEP_TIME); Thread.sleep(SLEEP_TIME);
} }
LOG.debug("Going to verify the result, {} records expected", COUNT);
verify(UTIL2, 0, COUNT); verify(UTIL2, 0, COUNT);
LOG.debug("Verification successfully done");
} }
private void waitForRSShutdownToStartAndFinish(JVMClusterUtil.MasterThread activeMaster, private void waitForRSShutdownToStartAndFinish(JVMClusterUtil.MasterThread activeMaster,
@ -106,15 +117,14 @@ public class TestSyncReplicationStandbyKillRS extends SyncReplicationTestBase {
ServerManager sm = activeMaster.getMaster().getServerManager(); ServerManager sm = activeMaster.getMaster().getServerManager();
// First wait for it to be in dead list // First wait for it to be in dead list
while (!sm.getDeadServers().isDeadServer(serverName)) { while (!sm.getDeadServers().isDeadServer(serverName)) {
LOG.debug("Waiting for [" + serverName + "] to be listed as dead in master"); LOG.debug("Waiting for {} to be listed as dead in master", serverName);
Thread.sleep(SLEEP_TIME); Thread.sleep(SLEEP_TIME);
} }
LOG.debug("Server [" + serverName + "] marked as dead, waiting for it to " + LOG.debug("Server {} marked as dead, waiting for it to finish dead processing", serverName);
"finish dead processing");
while (sm.areDeadServersInProgress()) { while (sm.areDeadServersInProgress()) {
LOG.debug("Server [" + serverName + "] still being processed, waiting"); LOG.debug("Server {} still being processed, waiting", serverName);
Thread.sleep(SLEEP_TIME); Thread.sleep(SLEEP_TIME);
} }
LOG.debug("Server [" + serverName + "] done with server shutdown processing"); LOG.debug("Server {} done with server shutdown processing", serverName);
} }
} }