HBASE-14802 Replaying server crash recovery procedure after a failover causes incorrect handling of deadservers

This commit is contained in:
stack 2015-11-13 22:06:23 -08:00
parent 8c9fc6eb97
commit 7c3c9ac9c6
3 changed files with 59 additions and 6 deletions

View File

@ -58,6 +58,11 @@ public class DeadServer {
*/ */
private int numProcessing = 0; private int numProcessing = 0;
/**
* Whether a dead server is being processed currently.
*/
private boolean processing = false;
/** /**
* A dead server that comes back alive has a different start code. The new start code should be * A dead server that comes back alive has a different start code. The new start code should be
* greater than the old one, but we don't take this into account in this method. * greater than the old one, but we don't take this into account in this method.
@ -94,9 +99,7 @@ public class DeadServer {
* *
* @return true if any RS are being processed as dead * @return true if any RS are being processed as dead
*/ */
public synchronized boolean areDeadServersInProgress() { public synchronized boolean areDeadServersInProgress() { return processing; }
return numProcessing != 0;
}
public synchronized Set<ServerName> copyServerNames() { public synchronized Set<ServerName> copyServerNames() {
Set<ServerName> clone = new HashSet<ServerName>(deadServers.size()); Set<ServerName> clone = new HashSet<ServerName>(deadServers.size());
@ -109,15 +112,34 @@ public class DeadServer {
* @param sn the server name * @param sn the server name
*/ */
public synchronized void add(ServerName sn) { public synchronized void add(ServerName sn) {
this.numProcessing++; processing = true;
if (!deadServers.containsKey(sn)){ if (!deadServers.containsKey(sn)){
deadServers.put(sn, EnvironmentEdgeManager.currentTime()); deadServers.put(sn, EnvironmentEdgeManager.currentTime());
} }
} }
/**
* Notify that we started processing this dead server.
* @param sn ServerName for the dead server.
*/
public synchronized void notifyServer(ServerName sn) {
if (LOG.isDebugEnabled()) { LOG.debug("Started processing " + sn); }
processing = true;
numProcessing++;
}
public synchronized void finish(ServerName sn) { public synchronized void finish(ServerName sn) {
if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + this.numProcessing); numProcessing--;
this.numProcessing--; if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + numProcessing);
assert numProcessing >= 0: "Number of dead servers in processing should always be non-negative";
if (numProcessing < 0) {
LOG.error("Number of dead servers in processing = " + numProcessing
+ ". Something went wrong, this should always be non-negative.");
numProcessing = 0;
}
if (numProcessing == 0) { processing = false; }
} }
public synchronized int size() { public synchronized int size() {

View File

@ -109,6 +109,11 @@ implements ServerProcedureInterface {
*/ */
private ServerName serverName; private ServerName serverName;
/**
* Whether DeadServer knows that we are processing it.
*/
private boolean notifiedDeadServer = false;
/** /**
* Regions that were on the crashed server. * Regions that were on the crashed server.
*/ */
@ -183,6 +188,13 @@ implements ServerProcedureInterface {
if (!services.getAssignmentManager().isFailoverCleanupDone()) { if (!services.getAssignmentManager().isFailoverCleanupDone()) {
throwProcedureYieldException("Waiting on master failover to complete"); throwProcedureYieldException("Waiting on master failover to complete");
} }
// HBASE-14802
// If we have not yet notified that we are processing a dead server, we should do now.
if (!notifiedDeadServer) {
services.getServerManager().getDeadServers().notifyServer(serverName);
notifiedDeadServer = true;
}
try { try {
switch (state) { switch (state) {
case SERVER_CRASH_START: case SERVER_CRASH_START:

View File

@ -17,7 +17,11 @@
*/ */
package org.apache.hadoop.hbase.master; package org.apache.hadoop.hbase.master;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
@ -43,16 +47,19 @@ public class TestDeadServer {
@Test public void testIsDead() { @Test public void testIsDead() {
DeadServer ds = new DeadServer(); DeadServer ds = new DeadServer();
ds.add(hostname123); ds.add(hostname123);
ds.notifyServer(hostname123);
assertTrue(ds.areDeadServersInProgress()); assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname123); ds.finish(hostname123);
assertFalse(ds.areDeadServersInProgress()); assertFalse(ds.areDeadServersInProgress());
ds.add(hostname1234); ds.add(hostname1234);
ds.notifyServer(hostname1234);
assertTrue(ds.areDeadServersInProgress()); assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname1234); ds.finish(hostname1234);
assertFalse(ds.areDeadServersInProgress()); assertFalse(ds.areDeadServersInProgress());
ds.add(hostname12345); ds.add(hostname12345);
ds.notifyServer(hostname12345);
assertTrue(ds.areDeadServersInProgress()); assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname12345); ds.finish(hostname12345);
assertFalse(ds.areDeadServersInProgress()); assertFalse(ds.areDeadServersInProgress());
@ -75,6 +82,18 @@ public class TestDeadServer {
assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive)); assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive));
} }
@Test(timeout = 15000)
public void testCrashProcedureReplay() throws Exception {
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
TEST_UTIL.startMiniCluster();
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
ProcedureExecutor pExecutor = master.getMasterProcedureExecutor();
ServerCrashProcedure proc = new ServerCrashProcedure(hostname123, false, false);
ProcedureTestingUtility.submitAndWait(pExecutor, proc);
assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress());
}
@Test @Test
public void testSortExtract(){ public void testSortExtract(){