HBASE-27157 Potential race condition in WorkerAssigner (#4577)
Co-authored-by: huiruan <huiruan@tencent.com> Signed-off-by: Duo Zhang <zhangduo@apache.org> Signed-off-by: Lijin Bin <binlijin@apache.org>
This commit is contained in:
parent
6031a3a8d4
commit
f76d8554ca
|
@ -35,7 +35,6 @@ import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.fs.PathIsNotEmptyDirectoryException;
|
import org.apache.hadoop.fs.PathIsNotEmptyDirectoryException;
|
||||||
import org.apache.hadoop.hbase.HConstants;
|
import org.apache.hadoop.hbase.HConstants;
|
||||||
import org.apache.hadoop.hbase.ServerName;
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
|
||||||
import org.apache.hadoop.hbase.master.procedure.SplitWALProcedure;
|
import org.apache.hadoop.hbase.master.procedure.SplitWALProcedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||||
|
@ -153,25 +152,19 @@ public class SplitWALManager {
|
||||||
*/
|
*/
|
||||||
public ServerName acquireSplitWALWorker(Procedure<?> procedure)
|
public ServerName acquireSplitWALWorker(Procedure<?> procedure)
|
||||||
throws ProcedureSuspendedException {
|
throws ProcedureSuspendedException {
|
||||||
Optional<ServerName> worker = splitWorkerAssigner.acquire();
|
ServerName worker = splitWorkerAssigner.acquire(procedure);
|
||||||
if (worker.isPresent()) {
|
LOG.debug("Acquired split WAL worker={}", worker);
|
||||||
LOG.debug("Acquired split WAL worker={}", worker.get());
|
return worker;
|
||||||
return worker.get();
|
|
||||||
}
|
|
||||||
splitWorkerAssigner.suspend(procedure);
|
|
||||||
throw new ProcedureSuspendedException();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* After the worker finished the split WAL task, it will release the worker, and wake up all the
|
* After the worker finished the split WAL task, it will release the worker, and wake up all the
|
||||||
* suspend procedures in the ProcedureEvent
|
* suspend procedures in the ProcedureEvent
|
||||||
* @param worker worker which is about to release
|
* @param worker worker which is about to release
|
||||||
* @param scheduler scheduler which is to wake up the procedure event
|
|
||||||
*/
|
*/
|
||||||
public void releaseSplitWALWorker(ServerName worker, MasterProcedureScheduler scheduler) {
|
public void releaseSplitWALWorker(ServerName worker) {
|
||||||
LOG.debug("Release split WAL worker={}", worker);
|
LOG.debug("Release split WAL worker={}", worker);
|
||||||
splitWorkerAssigner.release(worker);
|
splitWorkerAssigner.release(worker);
|
||||||
splitWorkerAssigner.wake(scheduler);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -23,9 +23,9 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import org.apache.hadoop.hbase.ServerName;
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
|
||||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||||
|
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||||
import org.apache.yetus.audience.InterfaceAudience;
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -51,7 +51,7 @@ public class WorkerAssigner implements ServerListener {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized Optional<ServerName> acquire() {
|
public synchronized ServerName acquire(Procedure<?> proc) throws ProcedureSuspendedException {
|
||||||
List<ServerName> serverList = master.getServerManager().getOnlineServersList();
|
List<ServerName> serverList = master.getServerManager().getOnlineServersList();
|
||||||
Collections.shuffle(serverList);
|
Collections.shuffle(serverList);
|
||||||
Optional<ServerName> worker = serverList.stream()
|
Optional<ServerName> worker = serverList.stream()
|
||||||
|
@ -60,27 +60,30 @@ public class WorkerAssigner implements ServerListener {
|
||||||
.findAny();
|
.findAny();
|
||||||
worker.ifPresent(name -> currentWorkers.compute(name, (serverName,
|
worker.ifPresent(name -> currentWorkers.compute(name, (serverName,
|
||||||
availableWorker) -> availableWorker == null ? maxTasks - 1 : availableWorker - 1));
|
availableWorker) -> availableWorker == null ? maxTasks - 1 : availableWorker - 1));
|
||||||
return worker;
|
if (worker.isPresent()) {
|
||||||
|
ServerName sn = worker.get();
|
||||||
|
currentWorkers.compute(sn, (serverName,
|
||||||
|
availableWorker) -> availableWorker == null ? maxTasks - 1 : availableWorker - 1);
|
||||||
|
return sn;
|
||||||
|
} else {
|
||||||
|
event.suspend();
|
||||||
|
event.suspendIfNotReady(proc);
|
||||||
|
throw new ProcedureSuspendedException();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void release(ServerName serverName) {
|
public synchronized void release(ServerName serverName) {
|
||||||
currentWorkers.compute(serverName, (k, v) -> v == null ? null : v + 1);
|
currentWorkers.compute(serverName, (k, v) -> v == null ? null : v + 1);
|
||||||
}
|
|
||||||
|
|
||||||
public void suspend(Procedure<?> proc) {
|
|
||||||
event.suspend();
|
|
||||||
event.suspendIfNotReady(proc);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void wake(MasterProcedureScheduler scheduler) {
|
|
||||||
if (!event.isReady()) {
|
if (!event.isReady()) {
|
||||||
event.wake(scheduler);
|
event.wake(master.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void serverAdded(ServerName worker) {
|
public synchronized void serverAdded(ServerName worker) {
|
||||||
this.wake(master.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler());
|
if (!event.isReady()) {
|
||||||
|
event.wake(master.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void addUsedWorker(ServerName worker) {
|
public synchronized void addUsedWorker(ServerName worker) {
|
||||||
|
|
|
@ -109,8 +109,7 @@ public class SnapshotVerifyProcedure extends ServerRemoteProcedure
|
||||||
setFailure("verify-snapshot", e);
|
setFailure("verify-snapshot", e);
|
||||||
} finally {
|
} finally {
|
||||||
// release the worker
|
// release the worker
|
||||||
env.getMasterServices().getSnapshotManager().releaseSnapshotVerifyWorker(this, targetServer,
|
env.getMasterServices().getSnapshotManager().releaseSnapshotVerifyWorker(this, targetServer);
|
||||||
env.getProcedureScheduler());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -90,7 +90,7 @@ public class SplitWALProcedure
|
||||||
skipPersistence();
|
skipPersistence();
|
||||||
throw new ProcedureSuspendedException();
|
throw new ProcedureSuspendedException();
|
||||||
}
|
}
|
||||||
splitWALManager.releaseSplitWALWorker(worker, env.getProcedureScheduler());
|
splitWALManager.releaseSplitWALWorker(worker);
|
||||||
if (!finished) {
|
if (!finished) {
|
||||||
LOG.warn("Failed to split wal {} by server {}, retry...", walPath, worker);
|
LOG.warn("Failed to split wal {} by server {}, retry...", walPath, worker);
|
||||||
setNextState(MasterProcedureProtos.SplitWALState.ACQUIRE_SPLIT_WAL_WORKER);
|
setNextState(MasterProcedureProtos.SplitWALState.ACQUIRE_SPLIT_WAL_WORKER);
|
||||||
|
|
|
@ -63,7 +63,6 @@ import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
|
||||||
import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
|
import org.apache.hadoop.hbase.master.cleaner.HFileLinkCleaner;
|
||||||
import org.apache.hadoop.hbase.master.procedure.CloneSnapshotProcedure;
|
import org.apache.hadoop.hbase.master.procedure.CloneSnapshotProcedure;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
|
||||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
|
import org.apache.hadoop.hbase.master.procedure.MasterProcedureUtil;
|
||||||
import org.apache.hadoop.hbase.master.procedure.RestoreSnapshotProcedure;
|
import org.apache.hadoop.hbase.master.procedure.RestoreSnapshotProcedure;
|
||||||
import org.apache.hadoop.hbase.master.procedure.SnapshotProcedure;
|
import org.apache.hadoop.hbase.master.procedure.SnapshotProcedure;
|
||||||
|
@ -1419,20 +1418,14 @@ public class SnapshotManager extends MasterProcedureManager implements Stoppable
|
||||||
|
|
||||||
public ServerName acquireSnapshotVerifyWorker(SnapshotVerifyProcedure procedure)
|
public ServerName acquireSnapshotVerifyWorker(SnapshotVerifyProcedure procedure)
|
||||||
throws ProcedureSuspendedException {
|
throws ProcedureSuspendedException {
|
||||||
Optional<ServerName> worker = verifyWorkerAssigner.acquire();
|
ServerName worker = verifyWorkerAssigner.acquire(procedure);
|
||||||
if (worker.isPresent()) {
|
LOG.debug("{} Acquired verify snapshot worker={}", procedure, worker);
|
||||||
LOG.debug("{} Acquired verify snapshot worker={}", procedure, worker.get());
|
return worker;
|
||||||
return worker.get();
|
|
||||||
}
|
|
||||||
verifyWorkerAssigner.suspend(procedure);
|
|
||||||
throw new ProcedureSuspendedException();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void releaseSnapshotVerifyWorker(SnapshotVerifyProcedure procedure, ServerName worker,
|
public void releaseSnapshotVerifyWorker(SnapshotVerifyProcedure procedure, ServerName worker) {
|
||||||
MasterProcedureScheduler scheduler) {
|
|
||||||
LOG.debug("{} Release verify snapshot worker={}", procedure, worker);
|
LOG.debug("{} Release verify snapshot worker={}", procedure, worker);
|
||||||
verifyWorkerAssigner.release(worker);
|
verifyWorkerAssigner.release(worker);
|
||||||
verifyWorkerAssigner.wake(scheduler);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void restoreWorkers() {
|
private void restoreWorkers() {
|
||||||
|
|
|
@ -115,8 +115,7 @@ public class TestSplitWALManager {
|
||||||
Assert.assertNotNull(e);
|
Assert.assertNotNull(e);
|
||||||
Assert.assertTrue(e instanceof ProcedureSuspendedException);
|
Assert.assertTrue(e instanceof ProcedureSuspendedException);
|
||||||
|
|
||||||
splitWALManager.releaseSplitWALWorker(server, TEST_UTIL.getHBaseCluster().getMaster()
|
splitWALManager.releaseSplitWALWorker(server);
|
||||||
.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler());
|
|
||||||
Assert.assertNotNull(splitWALManager.acquireSplitWALWorker(testProcedures.get(3)));
|
Assert.assertNotNull(splitWALManager.acquireSplitWALWorker(testProcedures.get(3)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -348,7 +347,7 @@ public class TestSplitWALManager {
|
||||||
setNextState(MasterProcedureProtos.SplitWALState.RELEASE_SPLIT_WORKER);
|
setNextState(MasterProcedureProtos.SplitWALState.RELEASE_SPLIT_WORKER);
|
||||||
return Flow.HAS_MORE_STATE;
|
return Flow.HAS_MORE_STATE;
|
||||||
case RELEASE_SPLIT_WORKER:
|
case RELEASE_SPLIT_WORKER:
|
||||||
splitWALManager.releaseSplitWALWorker(worker, env.getProcedureScheduler());
|
splitWALManager.releaseSplitWALWorker(worker);
|
||||||
return Flow.NO_MORE_STATE;
|
return Flow.NO_MORE_STATE;
|
||||||
default:
|
default:
|
||||||
throw new UnsupportedOperationException("unhandled state=" + state);
|
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||||
|
|
Loading…
Reference in New Issue