HBASE-25447 remoteProc is suspended due to OOM ERROR (#2824)
Some OOMEs do not cause the JVM to exit, such as "java.lang.OutOfMemoryError: Direct buffer memory" and "java.lang.OutOfMemoryError: unable to create new native thread", because they don't call vmError#next_OnError_command. So abort the HMaster when an uncaught exception occurs in TimeoutExecutor; the new active HMaster will resume the suspended procedure. Signed-off-by: Duo Zhang <zhangduo@apache.org> Signed-off-by: stack <stack@apache.com> Signed-off-by: Pankaj Kumar <pankajkumar@apache.org> (cherry picked from commit 600be60a4b) (cherry picked from commit ae77f81e7e)
This commit is contained in:
parent
0a4685b085
commit
82a63abc8a
|
@ -106,6 +106,10 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
 * Installs the given handler as the uncaught-exception handler of {@code timeoutExecutor}.
 *
 * @param eh the handler forwarded to {@code timeoutExecutor.setUncaughtExceptionHandler}
 */
protected void setTimeoutExecutorUncaughtExceptionHandler(UncaughtExceptionHandler eh) {
|
||||
timeoutExecutor.setUncaughtExceptionHandler(eh);
|
||||
}
|
||||
|
||||
public boolean stop() {
|
||||
if (!running.getAndSet(false)) {
|
||||
return false;
|
||||
|
|
|
@ -94,6 +94,7 @@ public class RSProcedureDispatcher
|
|||
if (!super.start()) {
|
||||
return false;
|
||||
}
|
||||
setTimeoutExecutorUncaughtExceptionHandler(this::abort);
|
||||
if (master.isStopped()) {
|
||||
LOG.debug("Stopped");
|
||||
return false;
|
||||
|
@ -126,6 +127,13 @@ public class RSProcedureDispatcher
|
|||
return true;
|
||||
}
|
||||
|
||||
private void abort(Thread t, Throwable e) {
|
||||
LOG.error("Caught error", e);
|
||||
if (!master.isStopped() && !master.isStopping() && !master.isAborted()) {
|
||||
master.abort("Aborting master", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean stop() {
|
||||
if (!super.stop()) {
|
||||
|
|
Loading…
Reference in New Issue