HBASE-14802 Replaying server crash recovery procedure after a failover causes incorrect handling of deadservers; Reapply of slightly different patch... see issue

This commit is contained in:
stack 2015-11-15 14:47:07 -08:00
parent bb6581345f
commit dd5f454b03
3 changed files with 71 additions and 6 deletions

View File

@ -58,6 +58,11 @@ public class DeadServer {
*/
private int numProcessing = 0;
/**
* Whether a dead server is being processed currently.
*/
private boolean processing = false;
/**
* A dead server that comes back alive has a different start code. The new start code should be
* greater than the old one, but we don't take this into account in this method.
@ -94,9 +99,7 @@ public class DeadServer {
*
* @return true if any RS are being processed as dead
*/
public synchronized boolean areDeadServersInProgress() {
return numProcessing != 0;
}
public synchronized boolean areDeadServersInProgress() { return processing; }
public synchronized Set<ServerName> copyServerNames() {
Set<ServerName> clone = new HashSet<ServerName>(deadServers.size());
@ -109,15 +112,34 @@ public class DeadServer {
* @param sn the server name
*/
public synchronized void add(ServerName sn) {
this.numProcessing++;
processing = true;
if (!deadServers.containsKey(sn)){
deadServers.put(sn, EnvironmentEdgeManager.currentTime());
}
}
/**
* Notify that we started processing this dead server.
* @param sn ServerName for the dead server.
*/
public synchronized void notifyServer(ServerName sn) {
if (LOG.isDebugEnabled()) { LOG.debug("Started processing " + sn); }
processing = true;
numProcessing++;
}
public synchronized void finish(ServerName sn) {
if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + this.numProcessing);
this.numProcessing--;
numProcessing--;
if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + numProcessing);
assert numProcessing >= 0: "Number of dead servers in processing should always be non-negative";
if (numProcessing < 0) {
LOG.error("Number of dead servers in processing = " + numProcessing
+ ". Something went wrong, this should always be non-negative.");
numProcessing = 0;
}
if (numProcessing == 0) { processing = false; }
}
public synchronized int size() {

View File

@ -109,6 +109,11 @@ implements ServerProcedureInterface {
*/
private ServerName serverName;
/**
* Whether DeadServer knows that we are processing it.
*/
private boolean notifiedDeadServer = false;
/**
* Regions that were on the crashed server.
*/
@ -183,6 +188,13 @@ implements ServerProcedureInterface {
if (!services.getAssignmentManager().isFailoverCleanupDone()) {
throwProcedureYieldException("Waiting on master failover to complete");
}
// HBASE-14802
// If we have not yet notified that we are processing a dead server, we should do now.
if (!notifiedDeadServer) {
services.getServerManager().getDeadServers().notifyServer(serverName);
notifiedDeadServer = true;
}
try {
switch (state) {
case SERVER_CRASH_START:

View File

@ -17,13 +17,19 @@
*/
package org.apache.hadoop.hbase.master;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
import org.apache.hadoop.hbase.util.Pair;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@ -35,24 +41,39 @@ import static org.junit.Assert.assertTrue;
@Category({MasterTests.class, MediumTests.class})
public class TestDeadServer {
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
final ServerName hostname123 = ServerName.valueOf("127.0.0.1", 123, 3L);
final ServerName hostname123_2 = ServerName.valueOf("127.0.0.1", 123, 4L);
final ServerName hostname1234 = ServerName.valueOf("127.0.0.2", 1234, 4L);
final ServerName hostname12345 = ServerName.valueOf("127.0.0.2", 12345, 4L);
@BeforeClass
public static void setupBeforeClass() throws Exception {
TEST_UTIL.startMiniCluster();
}
@AfterClass
public static void tearDownAfterClass() throws Exception {
TEST_UTIL.shutdownMiniCluster();
}
@Test public void testIsDead() {
DeadServer ds = new DeadServer();
ds.add(hostname123);
ds.notifyServer(hostname123);
assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname123);
assertFalse(ds.areDeadServersInProgress());
ds.add(hostname1234);
ds.notifyServer(hostname1234);
assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname1234);
assertFalse(ds.areDeadServersInProgress());
ds.add(hostname12345);
ds.notifyServer(hostname12345);
assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname12345);
assertFalse(ds.areDeadServersInProgress());
@ -75,6 +96,16 @@ public class TestDeadServer {
assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive));
}
@Test(timeout = 15000)
public void testCrashProcedureReplay() {
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
ProcedureExecutor pExecutor = master.getMasterProcedureExecutor();
ServerCrashProcedure proc = new ServerCrashProcedure(hostname123, false, false);
ProcedureTestingUtility.submitAndWait(pExecutor, proc);
assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress());
}
@Test
public void testSortExtract(){