HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure

Signed-off-by: Michael Stack <stack@apache.org>
Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
Allan Yang 2018-08-14 12:09:42 -07:00 committed by zhangduo
parent a83073aff0
commit bdca019b9e
4 changed files with 99 additions and 4 deletions

View File

@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> {
"hbase.procedure.worker.keep.alive.time.msec"; "hbase.procedure.worker.keep.alive.time.msec";
private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1); private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
/**
* {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to
* break PE having it fail at various junctures. When non-null, testing is set to an instance of
* the below internal {@link Testing} class with flags set for the particular test.
*/
Testing testing = null; Testing testing = null;
/**
* Class with parameters describing how to fail/die when in testing-context.
*/
public static class Testing { public static class Testing {
protected boolean killIfSuspended = false; protected boolean killIfSuspended = false;
/**
* Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is
* persisting all the state it needs to recover after a crash.
*/
protected boolean killBeforeStoreUpdate = false; protected boolean killBeforeStoreUpdate = false;
protected boolean toggleKillBeforeStoreUpdate = false; protected boolean toggleKillBeforeStoreUpdate = false;
/**
* Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978
* is about a case where memory-state was being set after store to WAL where a crash could
* cause us to get stuck. This flag allows killing at what was a vulnerable time.
*/
protected boolean killAfterStoreUpdate = false;
protected boolean toggleKillAfterStoreUpdate = false;
protected boolean shouldKillBeforeStoreUpdate() { protected boolean shouldKillBeforeStoreUpdate() {
final boolean kill = this.killBeforeStoreUpdate; final boolean kill = this.killBeforeStoreUpdate;
if (this.toggleKillBeforeStoreUpdate) { if (this.toggleKillBeforeStoreUpdate) {
@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> {
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) { protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate(); return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
} }
protected boolean shouldKillAfterStoreUpdate() {
final boolean kill = this.killAfterStoreUpdate;
if (this.toggleKillAfterStoreUpdate) {
this.killAfterStoreUpdate = !kill;
LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate);
}
return kill;
}
protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) {
return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate();
}
} }
public interface ProcedureExecutorListener { public interface ProcedureExecutorListener {
@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> {
break; break;
case WAITING: case WAITING:
if (!proc.hasChildren()) { if (!proc.hasChildren()) {
// Normally, WAITING procedures should be waken by its children.
// But, there is a case that, all the children are successful and before
// they can wake up their parent procedure, the master was killed.
// So, during recovering the procedures from ProcedureWal, its children
// are not loaded because of their SUCCESS state.
// So we need to continue to run this WAITING procedure. But before
// executing, we need to set its state to RUNNABLE, otherwise, a exception
// will throw:
// Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE,
// "NOT RUNNABLE! " + procedure.toString());
proc.setState(ProcedureState.RUNNABLE);
runnableList.add(proc); runnableList.add(proc);
} }
break; break;
@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> {
// allows to kill the executor before something is stored to the wal. // allows to kill the executor before something is stored to the wal.
// useful to test the procedure recovery. // useful to test the procedure recovery.
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) { if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
String msg = "TESTING: Kill before store update: " + procedure; kill("TESTING: Kill BEFORE store update: " + procedure);
LOG.debug(msg);
stop();
throw new RuntimeException(msg);
} }
// TODO: The code here doesn't check if store is running before persisting to the store as // TODO: The code here doesn't check if store is running before persisting to the store as
@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> {
assert (reExecute && subprocs == null) || !reExecute; assert (reExecute && subprocs == null) || !reExecute;
} while (reExecute); } while (reExecute);
// Allows to kill the executor after something is stored to the WAL but before the below
// state settings are done -- in particular the one on the end where we make parent
// RUNNABLE again when its children are done; see countDownChildren.
if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) {
kill("TESTING: Kill AFTER store update: " + procedure);
}
// Submit the new subprocedures // Submit the new subprocedures
if (subprocs != null && !procedure.isFailed()) { if (subprocs != null && !procedure.isFailed()) {
submitChildrenProcedures(subprocs); submitChildrenProcedures(subprocs);
@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> {
} }
} }
private void kill(String msg) {
LOG.debug(msg);
stop();
throw new RuntimeException(msg);
}
private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack, private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) { Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
assert subprocs != null : "expected subprocedures"; assert subprocs != null : "expected subprocedures";

View File

@ -167,6 +167,13 @@ public class ProcedureTestingUtility {
assertSingleExecutorForKillTests(procExecutor); assertSingleExecutorForKillTests(procExecutor);
} }
public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) {
createExecutorTesting(procExecutor);
procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate;
LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate);
assertSingleExecutorForKillTests(procExecutor);
}
public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor, public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
boolean value) { boolean value) {
ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value); ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);

View File

@ -109,6 +109,29 @@ public class TestChildProcedures {
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId); ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
} }
/**
* Test the state setting that happens after store to WAL; in particular the bit where we
* set the parent runnable again after its children have all completed successfully.
* See HBASE-20978.
*/
@Test
public void testChildLoadWithRestartAfterChildSuccess() throws Exception {
procEnv.toggleKillAfterStoreUpdate = true;
TestRootProcedure proc = new TestRootProcedure();
long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc);
int restartCount = 0;
while (!procExecutor.isFinished(procId)) {
ProcedureTestingUtility.restart(procExecutor);
ProcedureTestingUtility.waitProcedure(procExecutor, proc);
restartCount++;
}
assertEquals(4, restartCount);
assertTrue("expected completed proc", procExecutor.isFinished(procId));
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
}
@Test @Test
public void testChildRollbackLoad() throws Exception { public void testChildRollbackLoad() throws Exception {
procEnv.toggleKillBeforeStoreUpdate = false; procEnv.toggleKillBeforeStoreUpdate = false;
@ -154,6 +177,9 @@ public class TestChildProcedures {
if (env.toggleKillBeforeStoreUpdate) { if (env.toggleKillBeforeStoreUpdate) {
ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor); ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
} }
if (env.toggleKillAfterStoreUpdate) {
ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor);
}
return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() }; return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
} }
@ -193,6 +219,7 @@ public class TestChildProcedures {
private static class TestProcEnv { private static class TestProcEnv {
public boolean toggleKillBeforeStoreUpdate = false; public boolean toggleKillBeforeStoreUpdate = false;
public boolean toggleKillAfterStoreUpdate = false;
public boolean triggerRollbackOnChild = false; public boolean triggerRollbackOnChild = false;
} }
} }

View File

@ -21,6 +21,10 @@
*/ */
--> -->
<configuration> <configuration>
<property>
<name>hbase.defaults.for.version.skip</name>
<value>true</value>
</property>
<property> <property>
<name>hbase.procedure.store.wal.use.hsync</name> <name>hbase.procedure.store.wal.use.hsync</name>
<value>false</value> <value>false</value>