HBASE-21440 Assign procedure on the crashed server is not properly interrupted
This commit is contained in:
parent
f8213a719f
commit
d0c2e60e36
|
@ -237,7 +237,7 @@ public abstract class RemoteProcedureDispatcher<TEnv, TRemote extends Comparable
|
|||
/**
|
||||
* Called when the executeProcedure call is failed.
|
||||
*/
|
||||
void remoteCallFailed(TEnv env, TRemote remote, IOException exception);
|
||||
boolean remoteCallFailed(TEnv env, TRemote remote, IOException exception);
|
||||
|
||||
/**
|
||||
* Called when RS tells the remote procedure is succeeded through the
|
||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.hadoop.hbase.master.TableStateManager;
|
|||
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
|
@ -388,6 +389,17 @@ public class AssignProcedure extends RegionTransitionProcedure {
|
|||
@Override
|
||||
protected boolean remoteCallFailed(final MasterProcedureEnv env, final RegionStateNode regionNode,
|
||||
final IOException exception) {
|
||||
RegionTransitionState tState = getTransitionState();
|
||||
if (tState == RegionTransitionState.REGION_TRANSITION_FINISH
|
||||
&& exception instanceof ServerCrashException) {
|
||||
// if we found that AssignProcedure is at this stage, then ServerCerash handling may/may not
|
||||
// have any effect
|
||||
// depending upon the race between handling of the failure and execution at
|
||||
// REGION_TRANSITION_FINISH state
|
||||
LOG.warn("Assign Procedure is at state:" + tState
|
||||
+ ", so Handling of Server Crash may not have any affect");
|
||||
return false;
|
||||
}
|
||||
handleFailure(env, regionNode);
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -42,7 +42,6 @@ import org.apache.hadoop.hbase.TableName;
|
|||
import org.apache.hadoop.hbase.YouAreDeadException;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
|
||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.client.TableState;
|
||||
import org.apache.hadoop.hbase.exceptions.UnexpectedStateException;
|
||||
|
@ -64,7 +63,6 @@ import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
|
|||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
|
||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashException;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
|
||||
|
@ -88,7 +86,6 @@ import org.slf4j.LoggerFactory;
|
|||
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionTransitionState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
|
||||
|
@ -1874,42 +1871,6 @@ public class AssignmentManager implements ServerListener {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle RIT of meta region against crashed server.
|
||||
* Only used when ServerCrashProcedure is not enabled.
|
||||
* See handleRIT in ServerCrashProcedure for similar function.
|
||||
*
|
||||
* @param serverName Server that has already crashed
|
||||
*/
|
||||
public void handleMetaRITOnCrashedServer(ServerName serverName) {
|
||||
RegionInfo hri = RegionReplicaUtil
|
||||
.getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO,
|
||||
RegionInfo.DEFAULT_REPLICA_ID);
|
||||
RegionState regionStateNode = getRegionStates().getRegionState(hri);
|
||||
if (regionStateNode == null) {
|
||||
LOG.warn("RegionStateNode is null for " + hri);
|
||||
return;
|
||||
}
|
||||
ServerName rsnServerName = regionStateNode.getServerName();
|
||||
if (rsnServerName != null && !rsnServerName.equals(serverName)) {
|
||||
return;
|
||||
} else if (rsnServerName == null) {
|
||||
LOG.warn("Empty ServerName in RegionStateNode; proceeding anyways in case latched " +
|
||||
"RecoverMetaProcedure so meta latch gets cleaned up.");
|
||||
}
|
||||
// meta has been assigned to crashed server.
|
||||
LOG.info("Meta assigned to crashed " + serverName + "; reassigning...");
|
||||
// Handle failure and wake event
|
||||
RegionTransitionProcedure rtp = getRegionStates().getRegionTransitionProcedure(hri);
|
||||
// Do not need to consider for REGION_TRANSITION_QUEUE step
|
||||
if (rtp != null && rtp.isMeta() &&
|
||||
rtp.getTransitionState() == RegionTransitionState.REGION_TRANSITION_DISPATCH) {
|
||||
LOG.debug("Failing " + rtp.toString());
|
||||
rtp.remoteCallFailed(master.getMasterProcedureExecutor().getEnvironment(), serverName,
|
||||
new ServerCrashException(rtp.getProcId(), serverName));
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
MasterServices getMaster() {
|
||||
return master;
|
||||
|
|
|
@ -204,7 +204,7 @@ public abstract class RegionTransitionProcedure
|
|||
this.transitionState = state;
|
||||
}
|
||||
|
||||
RegionTransitionState getTransitionState() {
|
||||
protected RegionTransitionState getTransitionState() {
|
||||
return transitionState;
|
||||
}
|
||||
|
||||
|
@ -238,7 +238,7 @@ public abstract class RegionTransitionProcedure
|
|||
RegionStateNode regionNode, IOException exception);
|
||||
|
||||
@Override
|
||||
public synchronized void remoteCallFailed(final MasterProcedureEnv env,
|
||||
public synchronized boolean remoteCallFailed(final MasterProcedureEnv env,
|
||||
final ServerName serverName, final IOException exception) {
|
||||
final RegionStateNode regionNode = getRegionState(env);
|
||||
LOG.warn("Remote call failed {} {}", this, regionNode.toShortString(), exception);
|
||||
|
@ -247,8 +247,10 @@ public abstract class RegionTransitionProcedure
|
|||
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
|
||||
// this method. Just get out of this current processing quickly.
|
||||
regionNode.getProcedureEvent().wake(env.getProcedureScheduler());
|
||||
return true;
|
||||
}
|
||||
// else leave the procedure in suspended state; it is waiting on another call to this callback
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -419,13 +419,15 @@ public class ServerCrashProcedure
|
|||
if (sce == null) {
|
||||
sce = new ServerCrashException(getProcId(), getServerName());
|
||||
}
|
||||
rtp.remoteCallFailed(env, this.serverName, sce);
|
||||
// If an assign, remove from passed-in list of regions so we subsequently do not create
|
||||
// a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
|
||||
// and assign to a server other than the crashed one; no need to create new assign.
|
||||
// If an unassign, do not return this region; the above cancel will wake up the unassign and
|
||||
// it will complete. Done.
|
||||
it.remove();
|
||||
if(rtp.remoteCallFailed(env, this.serverName, sce)) {
|
||||
// If an assign, remove from passed-in list of regions so we subsequently do not create
|
||||
// a new assign; the exisitng assign after the call to remoteCallFailed will recalibrate
|
||||
// and assign to a server other than the crashed one; no need to create new assign.
|
||||
// If an unassign, do not return this region; the above cancel will wake up the unassign and
|
||||
// it will complete. Done.
|
||||
it.remove();
|
||||
}
|
||||
|
||||
}
|
||||
return toAssign;
|
||||
}
|
||||
|
|
|
@ -141,9 +141,10 @@ public class RefreshPeerProcedure extends Procedure<MasterProcedureEnv>
|
|||
}
|
||||
|
||||
@Override
|
||||
public synchronized void remoteCallFailed(MasterProcedureEnv env, ServerName remote,
|
||||
public synchronized boolean remoteCallFailed(MasterProcedureEnv env, ServerName remote,
|
||||
IOException exception) {
|
||||
complete(env, exception);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -108,11 +108,12 @@ public class TestAssignProcedure {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void remoteCallFailed(final MasterProcedureEnv env,
|
||||
public boolean remoteCallFailed(final MasterProcedureEnv env,
|
||||
final ServerName serverName, final IOException exception) {
|
||||
// Just skip this remoteCallFailed. Its too hard to mock. Assert it is called though.
|
||||
// Happens after the code we are testing has been called.
|
||||
this.remoteCallFailedWasCalled.set(true);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue