HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure

Signed-off-by: Michael Stack <stack@apache.org>
Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
Allan Yang 2018-08-14 12:09:42 -07:00 committed by zhangduo
parent a83073aff0
commit bdca019b9e
4 changed files with 99 additions and 4 deletions

View File

@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> {
"hbase.procedure.worker.keep.alive.time.msec";
private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
/**
* {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to
* break PE having it fail at various junctures. When non-null, testing is set to an instance of
* the below internal {@link Testing} class with flags set for the particular test.
*/
Testing testing = null;
/**
* Class with parameters describing how to fail/die when in testing-context.
*/
public static class Testing {
protected boolean killIfSuspended = false;
/**
* Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is
* persisting all the state it needs to recover after a crash.
*/
protected boolean killBeforeStoreUpdate = false;
protected boolean toggleKillBeforeStoreUpdate = false;
/**
* Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978
* is about a case where memory-state was being set after store to WAL where a crash could
* cause us to get stuck. This flag allows killing at what was a vulnerable time.
*/
protected boolean killAfterStoreUpdate = false;
protected boolean toggleKillAfterStoreUpdate = false;
protected boolean shouldKillBeforeStoreUpdate() {
final boolean kill = this.killBeforeStoreUpdate;
if (this.toggleKillBeforeStoreUpdate) {
@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> {
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
}
protected boolean shouldKillAfterStoreUpdate() {
final boolean kill = this.killAfterStoreUpdate;
if (this.toggleKillAfterStoreUpdate) {
this.killAfterStoreUpdate = !kill;
LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate);
}
return kill;
}
protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) {
return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate();
}
}
public interface ProcedureExecutorListener {
@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> {
break;
case WAITING:
if (!proc.hasChildren()) {
// Normally, WAITING procedures should be waken by its children.
// But, there is a case that, all the children are successful and before
// they can wake up their parent procedure, the master was killed.
// So, during recovering the procedures from ProcedureWal, its children
// are not loaded because of their SUCCESS state.
// So we need to continue to run this WAITING procedure. But before
// executing, we need to set its state to RUNNABLE, otherwise, a exception
// will throw:
// Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE,
// "NOT RUNNABLE! " + procedure.toString());
proc.setState(ProcedureState.RUNNABLE);
runnableList.add(proc);
}
break;
@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> {
// allows to kill the executor before something is stored to the wal.
// useful to test the procedure recovery.
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
String msg = "TESTING: Kill before store update: " + procedure;
LOG.debug(msg);
stop();
throw new RuntimeException(msg);
kill("TESTING: Kill BEFORE store update: " + procedure);
}
// TODO: The code here doesn't check if store is running before persisting to the store as
@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> {
assert (reExecute && subprocs == null) || !reExecute;
} while (reExecute);
// Allows to kill the executor after something is stored to the WAL but before the below
// state settings are done -- in particular the one on the end where we make parent
// RUNNABLE again when its children are done; see countDownChildren.
if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) {
kill("TESTING: Kill AFTER store update: " + procedure);
}
// Submit the new subprocedures
if (subprocs != null && !procedure.isFailed()) {
submitChildrenProcedures(subprocs);
@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> {
}
}
private void kill(String msg) {
LOG.debug(msg);
stop();
throw new RuntimeException(msg);
}
private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
assert subprocs != null : "expected subprocedures";

View File

@ -167,6 +167,13 @@ public class ProcedureTestingUtility {
assertSingleExecutorForKillTests(procExecutor);
}
public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) {
createExecutorTesting(procExecutor);
procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate;
LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate);
assertSingleExecutorForKillTests(procExecutor);
}
public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
boolean value) {
ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);

View File

@ -109,6 +109,29 @@ public class TestChildProcedures {
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
}
/**
* Test the state setting that happens after store to WAL; in particular the bit where we
* set the parent runnable again after its children have all completed successfully.
* See HBASE-20978.
*/
@Test
public void testChildLoadWithRestartAfterChildSuccess() throws Exception {
procEnv.toggleKillAfterStoreUpdate = true;
TestRootProcedure proc = new TestRootProcedure();
long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc);
int restartCount = 0;
while (!procExecutor.isFinished(procId)) {
ProcedureTestingUtility.restart(procExecutor);
ProcedureTestingUtility.waitProcedure(procExecutor, proc);
restartCount++;
}
assertEquals(4, restartCount);
assertTrue("expected completed proc", procExecutor.isFinished(procId));
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
}
@Test
public void testChildRollbackLoad() throws Exception {
procEnv.toggleKillBeforeStoreUpdate = false;
@ -154,6 +177,9 @@ public class TestChildProcedures {
if (env.toggleKillBeforeStoreUpdate) {
ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
}
if (env.toggleKillAfterStoreUpdate) {
ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor);
}
return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
}
@ -193,6 +219,7 @@ public class TestChildProcedures {
private static class TestProcEnv {
public boolean toggleKillBeforeStoreUpdate = false;
public boolean toggleKillAfterStoreUpdate = false;
public boolean triggerRollbackOnChild = false;
}
}

View File

@ -21,6 +21,10 @@
*/
-->
<configuration>
<property>
<name>hbase.defaults.for.version.skip</name>
<value>true</value>
</property>
<property>
<name>hbase.procedure.store.wal.use.hsync</name>
<value>false</value>