HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure
Signed-off-by: Michael Stack <stack@apache.org> Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
parent
a83073aff0
commit
bdca019b9e
|
@ -83,12 +83,34 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
"hbase.procedure.worker.keep.alive.time.msec";
|
||||
private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1);
|
||||
|
||||
/**
|
||||
* {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to
|
||||
* break PE by having it fail at various junctures. When non-null, testing is set to an instance of
|
||||
* the below internal {@link Testing} class with flags set for the particular test.
|
||||
*/
|
||||
Testing testing = null;
|
||||
|
||||
/**
|
||||
* Class with parameters describing how to fail/die when in testing-context.
|
||||
*/
|
||||
public static class Testing {
|
||||
protected boolean killIfSuspended = false;
|
||||
|
||||
/**
|
||||
* Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is
|
||||
* persisting all the state it needs to recover after a crash.
|
||||
*/
|
||||
protected boolean killBeforeStoreUpdate = false;
|
||||
protected boolean toggleKillBeforeStoreUpdate = false;
|
||||
|
||||
/**
|
||||
* Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978
|
||||
* is about a case where memory-state was being set after store to WAL where a crash could
|
||||
* cause us to get stuck. This flag allows killing at what was a vulnerable time.
|
||||
*/
|
||||
protected boolean killAfterStoreUpdate = false;
|
||||
protected boolean toggleKillAfterStoreUpdate = false;
|
||||
|
||||
protected boolean shouldKillBeforeStoreUpdate() {
|
||||
final boolean kill = this.killBeforeStoreUpdate;
|
||||
if (this.toggleKillBeforeStoreUpdate) {
|
||||
|
@ -101,6 +123,19 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) {
|
||||
return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate();
|
||||
}
|
||||
|
||||
protected boolean shouldKillAfterStoreUpdate() {
|
||||
final boolean kill = this.killAfterStoreUpdate;
|
||||
if (this.toggleKillAfterStoreUpdate) {
|
||||
this.killAfterStoreUpdate = !kill;
|
||||
LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate);
|
||||
}
|
||||
return kill;
|
||||
}
|
||||
|
||||
protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) {
|
||||
return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public interface ProcedureExecutorListener {
|
||||
|
@ -503,6 +538,17 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
break;
|
||||
case WAITING:
|
||||
if (!proc.hasChildren()) {
|
||||
// Normally, a WAITING procedure should be woken up by its children.
|
||||
// But, there is a case that, all the children are successful and before
|
||||
// they can wake up their parent procedure, the master was killed.
|
||||
// So, during recovering the procedures from ProcedureWal, its children
|
||||
// are not loaded because of their SUCCESS state.
|
||||
// So we need to continue to run this WAITING procedure. But before
|
||||
// executing, we need to set its state to RUNNABLE, otherwise, an exception
|
||||
// will be thrown:
|
||||
// Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE,
|
||||
// "NOT RUNNABLE! " + procedure.toString());
|
||||
proc.setState(ProcedureState.RUNNABLE);
|
||||
runnableList.add(proc);
|
||||
}
|
||||
break;
|
||||
|
@ -1562,10 +1608,7 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
// allows to kill the executor before something is stored to the wal.
|
||||
// useful to test the procedure recovery.
|
||||
if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) {
|
||||
String msg = "TESTING: Kill before store update: " + procedure;
|
||||
LOG.debug(msg);
|
||||
stop();
|
||||
throw new RuntimeException(msg);
|
||||
kill("TESTING: Kill BEFORE store update: " + procedure);
|
||||
}
|
||||
|
||||
// TODO: The code here doesn't check if store is running before persisting to the store as
|
||||
|
@ -1591,6 +1634,14 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
|
||||
assert (reExecute && subprocs == null) || !reExecute;
|
||||
} while (reExecute);
|
||||
|
||||
// Allows killing the executor after something is stored to the WAL but before the below
|
||||
// state settings are done -- in particular the one on the end where we make parent
|
||||
// RUNNABLE again when its children are done; see countDownChildren.
|
||||
if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) {
|
||||
kill("TESTING: Kill AFTER store update: " + procedure);
|
||||
}
|
||||
|
||||
// Submit the new subprocedures
|
||||
if (subprocs != null && !procedure.isFailed()) {
|
||||
submitChildrenProcedures(subprocs);
|
||||
|
@ -1608,6 +1659,12 @@ public class ProcedureExecutor<TEnvironment> {
|
|||
}
|
||||
}
|
||||
|
||||
private void kill(String msg) {
|
||||
LOG.debug(msg);
|
||||
stop();
|
||||
throw new RuntimeException(msg);
|
||||
}
|
||||
|
||||
private Procedure<TEnvironment>[] initializeChildren(RootProcedureState<TEnvironment> procStack,
|
||||
Procedure<TEnvironment> procedure, Procedure<TEnvironment>[] subprocs) {
|
||||
assert subprocs != null : "expected subprocedures";
|
||||
|
|
|
@ -167,6 +167,13 @@ public class ProcedureTestingUtility {
|
|||
assertSingleExecutorForKillTests(procExecutor);
|
||||
}
|
||||
|
||||
public static <TEnv> void toggleKillAfterStoreUpdate(ProcedureExecutor<TEnv> procExecutor) {
|
||||
createExecutorTesting(procExecutor);
|
||||
procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate;
|
||||
LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate);
|
||||
assertSingleExecutorForKillTests(procExecutor);
|
||||
}
|
||||
|
||||
public static <TEnv> void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor<TEnv> procExecutor,
|
||||
boolean value) {
|
||||
ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value);
|
||||
|
|
|
@ -109,6 +109,29 @@ public class TestChildProcedures {
|
|||
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test the state setting that happens after store to WAL; in particular the bit where we
|
||||
* set the parent runnable again after its children have all completed successfully.
|
||||
* See HBASE-20978.
|
||||
*/
|
||||
@Test
|
||||
public void testChildLoadWithRestartAfterChildSuccess() throws Exception {
|
||||
procEnv.toggleKillAfterStoreUpdate = true;
|
||||
|
||||
TestRootProcedure proc = new TestRootProcedure();
|
||||
long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc);
|
||||
int restartCount = 0;
|
||||
while (!procExecutor.isFinished(procId)) {
|
||||
ProcedureTestingUtility.restart(procExecutor);
|
||||
ProcedureTestingUtility.waitProcedure(procExecutor, proc);
|
||||
restartCount++;
|
||||
}
|
||||
assertEquals(4, restartCount);
|
||||
assertTrue("expected completed proc", procExecutor.isFinished(procId));
|
||||
ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testChildRollbackLoad() throws Exception {
|
||||
procEnv.toggleKillBeforeStoreUpdate = false;
|
||||
|
@ -154,6 +177,9 @@ public class TestChildProcedures {
|
|||
if (env.toggleKillBeforeStoreUpdate) {
|
||||
ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor);
|
||||
}
|
||||
if (env.toggleKillAfterStoreUpdate) {
|
||||
ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor);
|
||||
}
|
||||
return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() };
|
||||
}
|
||||
|
||||
|
@ -193,6 +219,7 @@ public class TestChildProcedures {
|
|||
|
||||
private static class TestProcEnv {
|
||||
public boolean toggleKillBeforeStoreUpdate = false;
|
||||
public boolean toggleKillAfterStoreUpdate = false;
|
||||
public boolean triggerRollbackOnChild = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,10 @@
|
|||
*/
|
||||
-->
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hbase.defaults.for.version.skip</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hbase.procedure.store.wal.use.hsync</name>
|
||||
<value>false</value>
|
||||
|
|
Loading…
Reference in New Issue