HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure
Signed-off-by: Michael Stack <stack@apache.org> Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
parent
a83073aff0
commit
bdca019b9e
|
@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> {
|
||||||
"hbase.procedure.worker.keep.alive.time.msec";
|
"hbase.procedure.worker.keep.alive.time.msec";
|
||||||
private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
|
private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to
|
||||||
|
* break PE having it fail at various junctures. When non-null, testing is set to an instance of
|
||||||
|
* the below internal {@link Testing} class with flags set for the particular test.
|
||||||
|
*/
|
||||||
Testing testing = null;
|
Testing testing = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class with parameters describing how to fail/die when in testing-context.
|
||||||
|
*/
|
||||||
public static class Testing {
|
public static class Testing {
|
||||||
protected boolean killIfSuspended = false;
|
protected boolean killIfSuspended = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is
|
||||||
|
* persisting all the state it needs to recover after a crash.
|
||||||
|
*/
|
||||||
protected boolean killBeforeStoreUpdate = false;
|
protected boolean killBeforeStoreUpdate = false;
|
||||||
protected boolean toggleKillBeforeStoreUpdate = false;
|
protected boolean toggleKillBeforeStoreUpdate = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978
|
||||||
|
* is about a case where memory-state was being set after store to WAL where a crash could
|
||||||
|
* cause us to get stuck. This flag allows killing at what was a vulnerable time.
|
||||||
|
*/
|
||||||
|
protected boolean killAfterStoreUpdate = false;
|
||||||
|
protected boolean toggleKillAfterStoreUpdate = false;
|
||||||
|
|
||||||
protected boolean shouldKillBeforeStoreUpdate() {
|
protected boolean shouldKillBeforeStoreUpdate() {
|
||||||
final boolean kill = this.killBeforeStoreUpdate;
|
final boolean kill = this.killBeforeStoreUpdate;
|
||||||
if (this.toggleKillBeforeStoreUpdate) {
|
if (this.toggleKillBeforeStoreUpdate) {
|
||||||
|
@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> {
|
||||||
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
|
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
|
||||||
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
|
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected boolean shouldKillAfterStoreUpdate() {
|
||||||
|
final boolean kill = this.killAfterStoreUpdate;
|
||||||
|
if (this.toggleKillAfterStoreUpdate) {
|
||||||
|
this.killAfterStoreUpdate = !kill;
|
||||||
|
LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate);
|
||||||
|
}
|
||||||
|
return kill;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) {
|
||||||
|
return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public interface ProcedureExecutorListener {
|
public interface ProcedureExecutorListener {
|
||||||
|
@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> {
|
||||||
break;
|
break;
|
||||||
case WAITING:
|
case WAITING:
|
||||||
if (!proc.hasChildren()) {
|
if (!proc.hasChildren()) {
|
||||||
|
// Normally, WAITING procedures should be waken by its children.
|
||||||
|
// But, there is a case that, all the children are successful and before
|
||||||
|
// they can wake up their parent procedure, the master was killed.
|
||||||
|
// So, during recovering the procedures from ProcedureWal, its children
|
||||||
|
// are not loaded because of their SUCCESS state.
|
||||||
|
// So we need to continue to run this WAITING procedure. But before
|
||||||
|
// executing, we need to set its state to RUNNABLE, otherwise, a exception
|
||||||
|
// will throw:
|
||||||
|
// Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE,
|
||||||
|
// "NOT RUNNABLE! " + procedure.toString());
|
||||||
|
proc.setState(ProcedureState.RUNNABLE);
|
||||||
runnableList.add(proc);
|
runnableList.add(proc);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> {
|
||||||
// allows to kill the executor before something is stored to the wal.
|
// allows to kill the executor before something is stored to the wal.
|
||||||
// useful to test the procedure recovery.
|
// useful to test the procedure recovery.
|
||||||
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
|
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
|
||||||
String msg = "TESTING: Kill before store update: " + procedure;
|
kill("TESTING: Kill BEFORE store update: " + procedure);
|
||||||
LOG.debug(msg);
|
|
||||||
stop();
|
|
||||||
throw new RuntimeException(msg);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: The code here doesn't check if store is running before persisting to the store as
|
// TODO: The code here doesn't check if store is running before persisting to the store as
|
||||||
|
@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> {
|
||||||
|
|
||||||
assert (reExecute && subprocs == null) || !reExecute;
|
assert (reExecute && subprocs == null) || !reExecute;
|
||||||
} while (reExecute);
|
} while (reExecute);
|
||||||
|
|
||||||
|
// Allows to kill the executor after something is stored to the WAL but before the below
|
||||||
|
// state settings are done -- in particular the one on the end where we make parent
|
||||||
|
// RUNNABLE again when its children are done; see countDownChildren.
|
||||||
|
if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) {
|
||||||
|
kill("TESTING: Kill AFTER store update: " + procedure);
|
||||||
|
}
|
||||||
|
|
||||||
// Submit the new subprocedures
|
// Submit the new subprocedures
|
||||||
if (subprocs != null && !procedure.isFailed()) {
|
if (subprocs != null && !procedure.isFailed()) {
|
||||||
submitChildrenProcedures(subprocs);
|
submitChildrenProcedures(subprocs);
|
||||||
|
@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void kill(String msg) {
|
||||||
|
LOG.debug(msg);
|
||||||
|
stop();
|
||||||
|
throw new RuntimeException(msg);
|
||||||
|
}
|
||||||
|
|
||||||
private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
|
private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
|
||||||
Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
|
Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
|
||||||
assert subprocs != null : "expected subprocedures";
|
assert subprocs != null : "expected subprocedures";
|
||||||
|
|
|
@ -167,6 +167,13 @@ public class ProcedureTestingUtility {
|
||||||
assertSingleExecutorForKillTests(procExecutor);
|
assertSingleExecutorForKillTests(procExecutor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) {
|
||||||
|
createExecutorTesting(procExecutor);
|
||||||
|
procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate;
|
||||||
|
LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate);
|
||||||
|
assertSingleExecutorForKillTests(procExecutor);
|
||||||
|
}
|
||||||
|
|
||||||
public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
|
public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
|
||||||
boolean value) {
|
boolean value) {
|
||||||
ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);
|
ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);
|
||||||
|
|
|
@ -109,6 +109,29 @@ public class TestChildProcedures {
|
||||||
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
|
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the state setting that happens after store to WAL; in particular the bit where we
|
||||||
|
* set the parent runnable again after its children have all completed successfully.
|
||||||
|
* See HBASE-20978.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testChildLoadWithRestartAfterChildSuccess() throws Exception {
|
||||||
|
procEnv.toggleKillAfterStoreUpdate = true;
|
||||||
|
|
||||||
|
TestRootProcedure proc = new TestRootProcedure();
|
||||||
|
long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc);
|
||||||
|
int restartCount = 0;
|
||||||
|
while (!procExecutor.isFinished(procId)) {
|
||||||
|
ProcedureTestingUtility.restart(procExecutor);
|
||||||
|
ProcedureTestingUtility.waitProcedure(procExecutor, proc);
|
||||||
|
restartCount++;
|
||||||
|
}
|
||||||
|
assertEquals(4, restartCount);
|
||||||
|
assertTrue("expected completed proc", procExecutor.isFinished(procId));
|
||||||
|
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testChildRollbackLoad() throws Exception {
|
public void testChildRollbackLoad() throws Exception {
|
||||||
procEnv.toggleKillBeforeStoreUpdate = false;
|
procEnv.toggleKillBeforeStoreUpdate = false;
|
||||||
|
@ -154,6 +177,9 @@ public class TestChildProcedures {
|
||||||
if (env.toggleKillBeforeStoreUpdate) {
|
if (env.toggleKillBeforeStoreUpdate) {
|
||||||
ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
|
ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
|
||||||
}
|
}
|
||||||
|
if (env.toggleKillAfterStoreUpdate) {
|
||||||
|
ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor);
|
||||||
|
}
|
||||||
return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
|
return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -193,6 +219,7 @@ public class TestChildProcedures {
|
||||||
|
|
||||||
private static class TestProcEnv {
|
private static class TestProcEnv {
|
||||||
public boolean toggleKillBeforeStoreUpdate = false;
|
public boolean toggleKillBeforeStoreUpdate = false;
|
||||||
|
public boolean toggleKillAfterStoreUpdate = false;
|
||||||
public boolean triggerRollbackOnChild = false;
|
public boolean triggerRollbackOnChild = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,10 @@
|
||||||
*/
|
*/
|
||||||
-->
|
-->
|
||||||
<configuration>
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>hbase.defaults.for.version.skip</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hbase.procedure.store.wal.use.hsync</name>
|
<name>hbase.procedure.store.wal.use.hsync</name>
|
||||||
<value>false</value>
|
<value>false</value>
|
||||||
|
|
Loading…
Reference in New Issue