HBASE-25447 remoteProc is suspended due to OOM ERROR (#2824)

Some OOMEs do not cause the JVM to exit, such as "java.lang.OutOfMemoryError: Direct buffer memory" and "java.lang.OutOfMemoryError: unable to create new native thread", because they don't call vmError#next_OnError_command. So abort the HMaster when an uncaught exception occurs in TimeoutExecutor; the new active HMaster will then resume the suspended procedure.

Signed-off-by: Duo Zhang <zhangduo@apache.org>
Signed-off-by: stack <stack@apache.com>
Signed-off-by: Pankaj Kumar<pankajkumar@apache.org>
(cherry picked from commit 600be60a4b)
(cherry picked from commit ae77f81e7e)
This commit is contained in:
Bo Cui 2021-01-04 23:34:38 +08:00 committed by Pankaj Kumar
parent 0a4685b085
commit 82a63abc8a
2 changed files with 12 additions and 0 deletions

View File

@ -106,6 +106,10 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
return true; return true;
} }
/**
 * Registers the handler that is invoked when the timeout executor terminates because of an
 * uncaught exception.
 *
 * @param eh the handler to install on the timeout executor
 */
protected void setTimeoutExecutorUncaughtExceptionHandler(UncaughtExceptionHandler eh) {
  this.timeoutExecutor.setUncaughtExceptionHandler(eh);
}
public boolean stop() { public boolean stop() {
if (!running.getAndSet(false)) { if (!running.getAndSet(false)) {
return false; return false;

View File

@ -94,6 +94,7 @@ public class RSProcedureDispatcher
if (!super.start()) { if (!super.start()) {
return false; return false;
} }
setTimeoutExecutorUncaughtExceptionHandler(this::abort);
if (master.isStopped()) { if (master.isStopped()) {
LOG.debug("Stopped"); LOG.debug("Stopped");
return false; return false;
@ -126,6 +127,13 @@ public class RSProcedureDispatcher
return true; return true;
} }
/**
 * Uncaught-exception callback: logs the error and aborts the master, unless the master already
 * reports itself as stopped, stopping, or aborted.
 *
 * @param t the thread the exception was raised on (not used here)
 * @param e the uncaught error, logged and passed on as the abort cause
 */
private void abort(Thread t, Throwable e) {
  LOG.error("Caught error", e);
  // Checks run in the same order as before: stopped, then stopping, then aborted.
  boolean alreadyGoingDown = master.isStopped() || master.isStopping() || master.isAborted();
  if (alreadyGoingDown) {
    return;
  }
  master.abort("Aborting master", e);
}
@Override @Override
public boolean stop() { public boolean stop() {
if (!super.stop()) { if (!super.stop()) {