diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java index 8b16b005484..c33cdcc951f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/DeadServer.java @@ -58,6 +58,11 @@ public class DeadServer { */ private int numProcessing = 0; + /** + * Whether a dead server is being processed currently. + */ + private boolean processing = false; + /** * A dead server that comes back alive has a different start code. The new start code should be * greater than the old one, but we don't take this into account in this method. @@ -94,9 +99,7 @@ public class DeadServer { * * @return true if any RS are being processed as dead */ - public synchronized boolean areDeadServersInProgress() { - return numProcessing != 0; - } + public synchronized boolean areDeadServersInProgress() { return processing; } public synchronized Set copyServerNames() { Set clone = new HashSet(deadServers.size()); @@ -109,15 +112,34 @@ public class DeadServer { * @param sn the server name */ public synchronized void add(ServerName sn) { - this.numProcessing++; + processing = true; if (!deadServers.containsKey(sn)){ deadServers.put(sn, EnvironmentEdgeManager.currentTime()); } } + /** + * Notify that we started processing this dead server. + * @param sn ServerName for the dead server. + */ + public synchronized void notifyServer(ServerName sn) { + if (LOG.isDebugEnabled()) { LOG.debug("Started processing " + sn); } + processing = true; + numProcessing++; + } + public synchronized void finish(ServerName sn) { - if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + this.numProcessing); - this.numProcessing--; + numProcessing--; + if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + numProcessing); + + assert numProcessing >= 0: "Number of dead servers in processing should always be non-negative"; + + if (numProcessing < 0) { + LOG.error("Number of dead servers in processing = " + numProcessing + + ". Something went wrong, this should always be non-negative."); + numProcessing = 0; + } + if (numProcessing == 0) { processing = false; } } public synchronized int size() { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 1e86a239643..5c9f6f40b57 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -109,6 +109,11 @@ implements ServerProcedureInterface { */ private ServerName serverName; + /** + * Whether DeadServer knows that we are processing it. + */ + private boolean notifiedDeadServer = false; + /** * Regions that were on the crashed server. */ @@ -183,6 +188,13 @@ implements ServerProcedureInterface { if (!services.getAssignmentManager().isFailoverCleanupDone()) { throwProcedureYieldException("Waiting on master failover to complete"); } + // HBASE-14802 + // If we have not yet notified that we are processing a dead server, we should do now. + if (!notifiedDeadServer) { + services.getServerManager().getDeadServers().notifyServer(serverName); + notifiedDeadServer = true; + } + try { switch (state) { case SERVER_CRASH_START: diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java index 40d26f4c80f..09122a1016c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDeadServer.java @@ -17,7 +17,11 @@ */ package org.apache.hadoop.hbase.master; +import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; @@ -43,16 +47,19 @@ public class TestDeadServer { @Test public void testIsDead() { DeadServer ds = new DeadServer(); ds.add(hostname123); + ds.notifyServer(hostname123); assertTrue(ds.areDeadServersInProgress()); ds.finish(hostname123); assertFalse(ds.areDeadServersInProgress()); ds.add(hostname1234); + ds.notifyServer(hostname1234); assertTrue(ds.areDeadServersInProgress()); ds.finish(hostname1234); assertFalse(ds.areDeadServersInProgress()); ds.add(hostname12345); + ds.notifyServer(hostname12345); assertTrue(ds.areDeadServersInProgress()); ds.finish(hostname12345); assertFalse(ds.areDeadServersInProgress()); @@ -75,6 +82,18 @@ public class TestDeadServer { assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive)); } + @Test(timeout = 15000) + public void testCrashProcedureReplay() throws Exception { + HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + TEST_UTIL.startMiniCluster(); + HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); + ProcedureExecutor pExecutor = master.getMasterProcedureExecutor(); + ServerCrashProcedure proc = new ServerCrashProcedure(hostname123, false, false); + + ProcedureTestingUtility.submitAndWait(pExecutor, proc); + + assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress()); + } @Test public void testSortExtract(){