HBASE-24545 Add backoff to SCP check on WAL split completion (#1891)
Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
parent
1dac9f69c4
commit
9fbf1f30c9
|
@ -314,6 +314,21 @@ public class SplitLogManager {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the amount of time in milliseconds to wait till next check.
|
||||
* Check less frequently if a bunch of work to do still. At a max, check every minute.
|
||||
* At a minimum, check every 100ms. This is to alleviate case where perhaps there are a bunch of
|
||||
* threads waiting on a completion. For example, if the zk-based implementation, we will scan the
|
||||
* '/hbase/splitWAL' dir every time through this loop. If there are lots of WALs to
|
||||
* split -- could be tens of thousands if big cluster -- then it will take a while. If
|
||||
* the Master has many SCPs waiting on wal splitting -- could be up to 10 x the configured
|
||||
* PE thread count (default would be 160) -- then the Master will be putting up a bunch of
|
||||
* load on zk.
|
||||
*/
|
||||
static int getBatchWaitTimeMillis(int remainingTasks) {
|
||||
return remainingTasks < 10? 100: remainingTasks < 100? 1000: 60_000;
|
||||
}
|
||||
|
||||
private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
|
||||
synchronized (batch) {
|
||||
while ((batch.done + batch.error) != batch.installed) {
|
||||
|
@ -338,7 +353,7 @@ public class SplitLogManager {
|
|||
return;
|
||||
}
|
||||
}
|
||||
batch.wait(100);
|
||||
batch.wait(getBatchWaitTimeMillis(remainingTasks));
|
||||
if (server.isStopped()) {
|
||||
LOG.warn("Stopped while waiting for log splits to be completed");
|
||||
return;
|
||||
|
|
|
@ -308,7 +308,8 @@ public class ServerCrashProcedure
|
|||
MasterWalManager mwm = env.getMasterServices().getMasterWalManager();
|
||||
AssignmentManager am = env.getMasterServices().getAssignmentManager();
|
||||
// TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
|
||||
// PROBLEM!!! WE BLOCK HERE.
|
||||
// PROBLEM!!! WE BLOCK HERE. Can block for hours if hundreds of WALs to split and hundreds
|
||||
// of SCPs running because big cluster crashed down.
|
||||
am.getRegionStates().logSplitting(this.serverName);
|
||||
mwm.splitLog(this.serverName);
|
||||
if (!carryingMeta) {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/**
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
|
@ -156,6 +156,15 @@ public class TestSplitLogManager {
|
|||
TEST_UTIL.shutdownMiniZKCluster();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBatchWaitMillis() {
|
||||
assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(0));
|
||||
assertEquals(100, SplitLogManager.getBatchWaitTimeMillis(1));
|
||||
assertEquals(1000, SplitLogManager.getBatchWaitTimeMillis(10));
|
||||
assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(101));
|
||||
assertEquals(60_000, SplitLogManager.getBatchWaitTimeMillis(1011));
|
||||
}
|
||||
|
||||
/**
 * Single-method callback that yields a {@code long} each time {@link #eval()} is invoked.
 * NOTE(review): its callers are outside this excerpt -- confirm how the value is used.
 */
private interface Expr {
|
||||
long eval();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue