HBASE-21440 Assign procedure on the crashed server is not properly interrupted

This commit is contained in:
Ankit Singhal 2018-11-14 22:31:53 -08:00 committed by stack
parent f8213a719f
commit d0c2e60e36
7 changed files with 30 additions and 51 deletions

View File

@ -237,7 +237,7 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
/**
* Called when the executeProcedure call is failed.
*/
void remoteCallFailed(TEnv env, TRemote remote, IOException exception);
boolean remoteCallFailed(TEnv env, TRemote remote, IOException exception);
/**
* Called when RS tells the remote procedure is succeeded through the

View File

@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.master.TableStateManager;
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
@ -388,6 +389,17 @@ public class AssignProcedure extends RegionTransitionProcedure {
@Override
protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode,
final IOException exception) {
RegionTransitionState tState = getTransitionState();
if (tState == RegionTransitionState.REGION_TRANSITION_FINISH
&& exception instanceof ServerCrashException) {
// if we found that AssignProcedure is at this stage, then ServerCerash handling may/may not
// have any effect
// depending upon the race between handling of the failure and execution at
// REGION_TRANSITION_FINISH state
LOG.warn("Assign Procedure is at state:" + tState
+ ", so Handling of Server Crash may not have any affect");
return false;
}
handleFailure(env, regionNode);
return true;
}

View File

@ -42,7 +42,6 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.TableState;
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
@ -64,7 +63,6 @@ import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
@ -88,7 +86,6 @@ import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
@ -1874,42 +1871,6 @@ public class AssignmentManager implements ServerListener {
}
}
/**
* Handle RIT of meta region against crashed server.
* Only used when ServerCrashProcedure is not enabled.
* See handleRIT in ServerCrashProcedure for similar function.
*
* @param serverName Server that has already crashed
*/
public void handleMetaRITOnCrashedServer(ServerName serverName) {
RegionInfo hri = RegionReplicaUtil
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
RegionInfo.DEFAULT_REPLICA_ID);
RegionState regionStateNode = getRegionStates().getRegionState(hri);
if (regionStateNode == null) {
LOG.warn("RegionStateNode is null for " + hri);
return;
}
ServerName rsnServerName = regionStateNode.getServerName();
if (rsnServerName != null && !rsnServerName.equals(serverName)) {
return;
} else if (rsnServerName == null) {
LOG.warn("Empty ServerName in RegionStateNode; proceeding anyways in case latched " +
"RecoverMetaProcedure so meta latch gets cleaned up.");
}
// meta has been assigned to crashed server.
LOG.info("Meta assigned to crashed " + serverName + "; reassigning...");
// Handle failure and wake event
RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
// Do not need to consider for REGION_TRANSITION_QUEUE step
if (rtp != null && rtp.isMeta() &&
rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
LOG.debug("Failing " + rtp.toString());
rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
new ServerCrashException(rtp.getProcId(), serverName));
}
}
@VisibleForTesting
MasterServices getMaster() {
return master;

View File

@ -204,7 +204,7 @@ public abstract class RegionTransitionProcedure
this.transitionState = state;
}
RegionTransitionState getTransitionState() {
protected RegionTransitionState getTransitionState() {
return transitionState;
}
@ -238,7 +238,7 @@ public abstract class RegionTransitionProcedure
RegionStateNode regionNode, IOException exception);
@Override
public synchronized void remoteCallFailed(final MasterProcedureEnv env,
public synchronized boolean remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) {
final RegionStateNode regionNode = getRegionState(env);
LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception);
@ -247,8 +247,10 @@ public abstract class RegionTransitionProcedure
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
// this method. Just get out of this current processing quickly.
regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
return true;
}
// else leave the procedure in suspended state; it is waiting on another call to this callback
return false;
}
/**

View File

@ -419,13 +419,15 @@ public class ServerCrashProcedure
if (sce == null) {
sce = new ServerCrashException(getProcId(), getServerName());
}
rtp.remoteCallFailed(env, this.serverName, sce);
// If an assign, remove from passed-in list of regions so we subsequently do not create
// a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
// and assign to a server other than the crashed one; no need to create new assign.
// If an unassign, do not return this region; the above cancel will wake up the unassign and
// it will complete. Done.
it.remove();
if(rtp.remoteCallFailed(env, this.serverName, sce)) {
// If an assign, remove from passed-in list of regions so we subsequently do not create
// a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
// and assign to a server other than the crashed one; no need to create new assign.
// If an unassign, do not return this region; the above cancel will wake up the unassign and
// it will complete. Done.
it.remove();
}
}
return toAssign;
}

View File

@ -141,9 +141,10 @@ public class RefreshPeerProcedure extends Procedure<MasterProcedureEnv>
}
@Override
public synchronized void remoteCallFailed(MasterProcedureEnv env, ServerName remote,
public synchronized boolean remoteCallFailed(MasterProcedureEnv env, ServerName remote,
IOException exception) {
complete(env, exception);
return true;
}
@Override

View File

@ -108,11 +108,12 @@ public class TestAssignProcedure {
}
@Override
public void remoteCallFailed(final MasterProcedureEnv env,
public boolean remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) {
// Just skip this remoteCallFailed. Its too hard to mock. Assert it is called though.
// Happens after the code we are testing has been called.
this.remoteCallFailedWasCalled.set(true);
return true;
}
};