From bdca019b9e0b92348d6245169b968ed14dd505db Mon Sep 17 00:00:00 2001 From: Allan Yang Date: Tue, 14 Aug 2018 12:09:42 -0700 Subject: [PATCH] HBASE-20978 [amv2] Worker terminating UNNATURALLY during MoveRegionProcedure Signed-off-by: Michael Stack Signed-off-by: Duo Zhang --- .../hbase/procedure2/ProcedureExecutor.java | 65 +++++++++++++++++-- .../procedure2/ProcedureTestingUtility.java | 7 ++ .../hbase/procedure2/TestChildProcedures.java | 27 ++++++++ .../src/test/resources/hbase-site.xml | 4 ++ 4 files changed, 99 insertions(+), 4 deletions(-) diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index 27bdb9edc0a..f773bf94a64 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -83,12 +83,34 @@ public class ProcedureExecutor { "hbase.procedure.worker.keep.alive.time.msec"; private static final long DEFAULT_WORKER_KEEP_ALIVE_TIME = TimeUnit.MINUTES.toMillis(1); + /** + * {@link #testing} is non-null when ProcedureExecutor is being tested. Tests will try to + * break PE having it fail at various junctures. When non-null, testing is set to an instance of + * the below internal {@link Testing} class with flags set for the particular test. + */ Testing testing = null; + + /** + * Class with parameters describing how to fail/die when in testing-context. + */ public static class Testing { protected boolean killIfSuspended = false; + + /** + * Kill the PE BEFORE we store state to the WAL. Good for figuring out if a Procedure is + * persisting all the state it needs to recover after a crash. + */ protected boolean killBeforeStoreUpdate = false; protected boolean toggleKillBeforeStoreUpdate = false; + /** + * Set when we want to fail AFTER state has been stored into the WAL. Rarely used. HBASE-20978 + * is about a case where memory-state was being set after store to WAL where a crash could + * cause us to get stuck. This flag allows killing at what was a vulnerable time. + */ + protected boolean killAfterStoreUpdate = false; + protected boolean toggleKillAfterStoreUpdate = false; + protected boolean shouldKillBeforeStoreUpdate() { final boolean kill = this.killBeforeStoreUpdate; if (this.toggleKillBeforeStoreUpdate) { @@ -101,6 +123,19 @@ public class ProcedureExecutor { protected boolean shouldKillBeforeStoreUpdate(final boolean isSuspended) { return (isSuspended && !killIfSuspended) ? false : shouldKillBeforeStoreUpdate(); } + + protected boolean shouldKillAfterStoreUpdate() { + final boolean kill = this.killAfterStoreUpdate; + if (this.toggleKillAfterStoreUpdate) { + this.killAfterStoreUpdate = !kill; + LOG.warn("Toggle KILL after store update to: " + this.killAfterStoreUpdate); + } + return kill; + } + + protected boolean shouldKillAfterStoreUpdate(final boolean isSuspended) { + return (isSuspended && !killIfSuspended) ? false : shouldKillAfterStoreUpdate(); + } } public interface ProcedureExecutorListener { @@ -503,6 +538,17 @@ public class ProcedureExecutor { break; case WAITING: if (!proc.hasChildren()) { + // Normally, WAITING procedures should be waken by its children. + // But, there is a case that, all the children are successful and before + // they can wake up their parent procedure, the master was killed. + // So, during recovering the procedures from ProcedureWal, its children + // are not loaded because of their SUCCESS state. + // So we need to continue to run this WAITING procedure. But before + // executing, we need to set its state to RUNNABLE, otherwise, a exception + // will throw: + // Preconditions.checkArgument(procedure.getState() == ProcedureState.RUNNABLE, + // "NOT RUNNABLE! " + procedure.toString()); + proc.setState(ProcedureState.RUNNABLE); runnableList.add(proc); } break; @@ -1562,10 +1608,7 @@ public class ProcedureExecutor { // allows to kill the executor before something is stored to the wal. // useful to test the procedure recovery. if (testing != null && testing.shouldKillBeforeStoreUpdate(suspended)) { - String msg = "TESTING: Kill before store update: " + procedure; - LOG.debug(msg); - stop(); - throw new RuntimeException(msg); + kill("TESTING: Kill BEFORE store update: " + procedure); } // TODO: The code here doesn't check if store is running before persisting to the store as @@ -1591,6 +1634,14 @@ public class ProcedureExecutor { assert (reExecute && subprocs == null) || !reExecute; } while (reExecute); + + // Allows to kill the executor after something is stored to the WAL but before the below + // state settings are done -- in particular the one on the end where we make parent + // RUNNABLE again when its children are done; see countDownChildren. + if (testing != null && testing.shouldKillAfterStoreUpdate(suspended)) { + kill("TESTING: Kill AFTER store update: " + procedure); + } + // Submit the new subprocedures if (subprocs != null && !procedure.isFailed()) { submitChildrenProcedures(subprocs); @@ -1608,6 +1659,12 @@ public class ProcedureExecutor { } } + private void kill(String msg) { + LOG.debug(msg); + stop(); + throw new RuntimeException(msg); + } + private Procedure[] initializeChildren(RootProcedureState procStack, Procedure procedure, Procedure[] subprocs) { assert subprocs != null : "expected subprocedures"; diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java index e8d72f9d636..138215b0e8f 100644 --- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java +++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/ProcedureTestingUtility.java @@ -167,6 +167,13 @@ public class ProcedureTestingUtility { assertSingleExecutorForKillTests(procExecutor); } + public static void toggleKillAfterStoreUpdate(ProcedureExecutor procExecutor) { + createExecutorTesting(procExecutor); + procExecutor.testing.killAfterStoreUpdate = !procExecutor.testing.killAfterStoreUpdate; + LOG.warn("Set Kill after store update to: " + procExecutor.testing.killAfterStoreUpdate); + assertSingleExecutorForKillTests(procExecutor); + } + public static void setKillAndToggleBeforeStoreUpdate(ProcedureExecutor procExecutor, boolean value) { ProcedureTestingUtility.setKillBeforeStoreUpdate(procExecutor, value); diff --git a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java index cce4caf1ce6..4f3c443faa9 100644 --- a/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java +++ b/hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestChildProcedures.java @@ -109,6 +109,29 @@ public class TestChildProcedures { ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId); } + + /** + * Test the state setting that happens after store to WAL; in particular the bit where we + * set the parent runnable again after its children have all completed successfully. + * See HBASE-20978. + */ + @Test + public void testChildLoadWithRestartAfterChildSuccess() throws Exception { + procEnv.toggleKillAfterStoreUpdate = true; + + TestRootProcedure proc = new TestRootProcedure(); + long procId = ProcedureTestingUtility.submitAndWait(procExecutor, proc); + int restartCount = 0; + while (!procExecutor.isFinished(procId)) { + ProcedureTestingUtility.restart(procExecutor); + ProcedureTestingUtility.waitProcedure(procExecutor, proc); + restartCount++; + } + assertEquals(4, restartCount); + assertTrue("expected completed proc", procExecutor.isFinished(procId)); + ProcedureTestingUtility.assertProcNotFailed(procExecutor, procId); + } + @Test public void testChildRollbackLoad() throws Exception { procEnv.toggleKillBeforeStoreUpdate = false; @@ -154,6 +177,9 @@ public class TestChildProcedures { if (env.toggleKillBeforeStoreUpdate) { ProcedureTestingUtility.toggleKillBeforeStoreUpdate(procExecutor); } + if (env.toggleKillAfterStoreUpdate) { + ProcedureTestingUtility.toggleKillAfterStoreUpdate(procExecutor); + } return new Procedure[] { new TestChildProcedure(), new TestChildProcedure() }; } @@ -193,6 +219,7 @@ public class TestChildProcedures { private static class TestProcEnv { public boolean toggleKillBeforeStoreUpdate = false; + public boolean toggleKillAfterStoreUpdate = false; public boolean triggerRollbackOnChild = false; } } diff --git a/hbase-procedure/src/test/resources/hbase-site.xml b/hbase-procedure/src/test/resources/hbase-site.xml index 114ee8a23c1..3709a719bae 100644 --- a/hbase-procedure/src/test/resources/hbase-site.xml +++ b/hbase-procedure/src/test/resources/hbase-site.xml @@ -21,6 +21,10 @@ */ --> + + hbase.defaults.for.version.skip + true + hbase.procedure.store.wal.use.hsync false