HBASE-14802 Replaying server crash recovery procedure after a failover causes incorrect handling of deadservers; Reapply of slightly different patch... see issue
This commit is contained in:
parent
bb6581345f
commit
dd5f454b03
|
@ -58,6 +58,11 @@ public class DeadServer {
|
|||
*/
|
||||
private int numProcessing = 0;
|
||||
|
||||
/**
|
||||
* Whether a dead server is being processed currently.
|
||||
*/
|
||||
private boolean processing = false;
|
||||
|
||||
/**
|
||||
* A dead server that comes back alive has a different start code. The new start code should be
|
||||
* greater than the old one, but we don't take this into account in this method.
|
||||
|
@ -94,9 +99,7 @@ public class DeadServer {
|
|||
*
|
||||
* @return true if any RS are being processed as dead
|
||||
*/
|
||||
public synchronized boolean areDeadServersInProgress() {
|
||||
return numProcessing != 0;
|
||||
}
|
||||
public synchronized boolean areDeadServersInProgress() { return processing; }
|
||||
|
||||
public synchronized Set<ServerName> copyServerNames() {
|
||||
Set<ServerName> clone = new HashSet<ServerName>(deadServers.size());
|
||||
|
@ -109,15 +112,34 @@ public class DeadServer {
|
|||
* @param sn the server name
|
||||
*/
|
||||
public synchronized void add(ServerName sn) {
|
||||
this.numProcessing++;
|
||||
processing = true;
|
||||
if (!deadServers.containsKey(sn)){
|
||||
deadServers.put(sn, EnvironmentEdgeManager.currentTime());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Notify that we started processing this dead server.
|
||||
* @param sn ServerName for the dead server.
|
||||
*/
|
||||
public synchronized void notifyServer(ServerName sn) {
|
||||
if (LOG.isDebugEnabled()) { LOG.debug("Started processing " + sn); }
|
||||
processing = true;
|
||||
numProcessing++;
|
||||
}
|
||||
|
||||
public synchronized void finish(ServerName sn) {
|
||||
if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + this.numProcessing);
|
||||
this.numProcessing--;
|
||||
numProcessing--;
|
||||
if (LOG.isDebugEnabled()) LOG.debug("Finished " + sn + "; numProcessing=" + numProcessing);
|
||||
|
||||
assert numProcessing >= 0: "Number of dead servers in processing should always be non-negative";
|
||||
|
||||
if (numProcessing < 0) {
|
||||
LOG.error("Number of dead servers in processing = " + numProcessing
|
||||
+ ". Something went wrong, this should always be non-negative.");
|
||||
numProcessing = 0;
|
||||
}
|
||||
if (numProcessing == 0) { processing = false; }
|
||||
}
|
||||
|
||||
public synchronized int size() {
|
||||
|
|
|
@ -109,6 +109,11 @@ implements ServerProcedureInterface {
|
|||
*/
|
||||
private ServerName serverName;
|
||||
|
||||
/**
|
||||
* Whether DeadServer knows that we are processing it.
|
||||
*/
|
||||
private boolean notifiedDeadServer = false;
|
||||
|
||||
/**
|
||||
* Regions that were on the crashed server.
|
||||
*/
|
||||
|
@ -183,6 +188,13 @@ implements ServerProcedureInterface {
|
|||
if (!services.getAssignmentManager().isFailoverCleanupDone()) {
|
||||
throwProcedureYieldException("Waiting on master failover to complete");
|
||||
}
|
||||
// HBASE-14802
|
||||
// If we have not yet notified that we are processing a dead server, we should do now.
|
||||
if (!notifiedDeadServer) {
|
||||
services.getServerManager().getDeadServers().notifyServer(serverName);
|
||||
notifiedDeadServer = true;
|
||||
}
|
||||
|
||||
try {
|
||||
switch (state) {
|
||||
case SERVER_CRASH_START:
|
||||
|
|
|
@ -17,13 +17,19 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.master;
|
||||
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
|
@ -35,24 +41,39 @@ import static org.junit.Assert.assertTrue;
|
|||
|
||||
@Category({MasterTests.class, MediumTests.class})
|
||||
public class TestDeadServer {
|
||||
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
|
||||
|
||||
final ServerName hostname123 = ServerName.valueOf("127.0.0.1", 123, 3L);
|
||||
final ServerName hostname123_2 = ServerName.valueOf("127.0.0.1", 123, 4L);
|
||||
final ServerName hostname1234 = ServerName.valueOf("127.0.0.2", 1234, 4L);
|
||||
final ServerName hostname12345 = ServerName.valueOf("127.0.0.2", 12345, 4L);
|
||||
|
||||
@BeforeClass
|
||||
public static void setupBeforeClass() throws Exception {
|
||||
TEST_UTIL.startMiniCluster();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void tearDownAfterClass() throws Exception {
|
||||
TEST_UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
@Test public void testIsDead() {
|
||||
DeadServer ds = new DeadServer();
|
||||
ds.add(hostname123);
|
||||
ds.notifyServer(hostname123);
|
||||
assertTrue(ds.areDeadServersInProgress());
|
||||
ds.finish(hostname123);
|
||||
assertFalse(ds.areDeadServersInProgress());
|
||||
|
||||
ds.add(hostname1234);
|
||||
ds.notifyServer(hostname1234);
|
||||
assertTrue(ds.areDeadServersInProgress());
|
||||
ds.finish(hostname1234);
|
||||
assertFalse(ds.areDeadServersInProgress());
|
||||
|
||||
ds.add(hostname12345);
|
||||
ds.notifyServer(hostname12345);
|
||||
assertTrue(ds.areDeadServersInProgress());
|
||||
ds.finish(hostname12345);
|
||||
assertFalse(ds.areDeadServersInProgress());
|
||||
|
@ -75,6 +96,16 @@ public class TestDeadServer {
|
|||
assertFalse(ds.cleanPreviousInstance(deadServerHostComingAlive));
|
||||
}
|
||||
|
||||
@Test(timeout = 15000)
|
||||
public void testCrashProcedureReplay() {
|
||||
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
|
||||
ProcedureExecutor pExecutor = master.getMasterProcedureExecutor();
|
||||
ServerCrashProcedure proc = new ServerCrashProcedure(hostname123, false, false);
|
||||
|
||||
ProcedureTestingUtility.submitAndWait(pExecutor, proc);
|
||||
|
||||
assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSortExtract(){
|
||||
|
|
Loading…
Reference in New Issue