HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)

Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
Michael Stack 2020-06-12 07:57:07 -07:00 committed by stack
parent d187ce2029
commit a4e1d073f4
3 changed files with 28 additions and 3 deletions

View File

@ -314,6 +314,21 @@ public class SplitLogManager {
return false;
}
/**
 * Computes how long, in milliseconds, to wait before checking again whether a batch of split
 * tasks has completed. The more work that remains, the less often we check:
 * <ul>
 * <li>fewer than 10 remaining tasks: 100ms (the minimum wait)</li>
 * <li>fewer than 100 remaining tasks: 1 second</li>
 * <li>100 or more remaining tasks: 60 seconds (the maximum wait)</li>
 * </ul>
 * The backoff alleviates the case where many threads are waiting on a completion. For example,
 * in the zk-based implementation, the '/hbase/splitWAL' dir is scanned every time through the
 * wait loop. If there are lots of WALs to split -- could be tens of thousands on a big
 * cluster -- then each scan takes a while. If the Master has many SCPs waiting on WAL
 * splitting -- could be up to 10 x the configured PE thread count (default would be 160) --
 * then the Master puts a lot of load on zk.
 * @param remainingTasks number of tasks in the batch that have not yet finished
 * @return the time to wait, in milliseconds, before the next check
 */
static int getBatchWaitTimeMillis(int remainingTasks) {
  if (remainingTasks >= 100) {
    // Lots of work outstanding: back off to the once-a-minute maximum.
    return 60_000;
  }
  if (remainingTasks >= 10) {
    return 1000;
  }
  // Nearly done: poll at the 100ms minimum.
  return 100;
}
private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
synchronized (batch) {
while ((batch.done + batch.error) != batch.installed) {
@ -338,7 +353,7 @@ public class SplitLogManager {
return;
}
}
batch.wait(100);
batch.wait(getBatchWaitTimeMillis(remainingTasks));
if (server.isStopped()) {
LOG.warn("Stopped while waiting for log splits to be completed");
return;

View File

@ -308,7 +308,8 @@ public class ServerCrashProcedure
MasterWalManager mwm = env.getMasterServices().getMasterWalManager();
AssignmentManager am = env.getMasterServices().getAssignmentManager();
// TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
// PROBLEM!!! WE BLOCK HERE.
// PROBLEM!!! WE BLOCK HERE. Can block for hours if there are hundreds of WALs to split and
// hundreds of SCPs running because a big cluster crashed.
am.getRegionStates().logSplitting(this.serverName);
mwm.splitLog(this.serverName);
if (!carryingMeta) {

View File

@ -1,4 +1,4 @@
/**
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@ -156,6 +156,15 @@ public class TestSplitLogManager {
TEST_UTIL.shutdownMiniZKCluster();
}
/**
 * Verifies the wait-time tiers returned by {@link SplitLogManager#getBatchWaitTimeMillis(int)},
 * including the values on either side of each tier boundary so that an off-by-one in the
 * threshold comparisons is caught.
 */
@Test
public void testBatchWaitMillis() {
  // Fewer than 10 remaining tasks: shortest wait.
  assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(0));
  assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(1));
  assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(9));
  // 10 up to 99 remaining tasks: one second.
  assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(10));
  assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(99));
  // 100 or more remaining tasks: one minute.
  assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(100));
  assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(101));
  assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(1011));
}
/**
 * Single-method callback that yields a {@code long} value when evaluated.
 * NOTE(review): usages are outside this view; presumably the tests poll it until it reaches an
 * expected value -- confirm against callers.
 */
private interface Expr {
// Evaluates the expression and returns its value.
long eval();
}