Revert "HBASE-20137 TestRSGroups is flakey"

Revert. Fix is not right.

This reverts commit 6d1740d498.
This commit is contained in:
Michael Stack 2018-03-07 09:24:09 -08:00
parent 6b77786dfc
commit 37d91cdfbb
5 changed files with 34 additions and 64 deletions

View File

@ -555,17 +555,15 @@ public class ServerManager {
} }
/* /*
* Expire the passed server. Add it to list of dead servers and queue a shutdown processing. * Expire the passed server. Add it to list of dead servers and queue a
* @return True if we expired passed <code>serverName</code> else false if we failed to schedule * shutdown processing.
* an expire (and attendant ServerCrashProcedure -- some clients are dependent on
* server crash procedure being queued and need to know if has not been queued).
*/ */
public synchronized boolean expireServer(final ServerName serverName) { public synchronized void expireServer(final ServerName serverName) {
if (serverName.equals(master.getServerName())) { if (serverName.equals(master.getServerName())) {
if (!(master.isAborted() || master.isStopped())) { if (!(master.isAborted() || master.isStopped())) {
master.stop("We lost our znode?"); master.stop("We lost our znode?");
} }
return false; return;
} }
if (!master.isServerCrashProcessingEnabled()) { if (!master.isServerCrashProcessingEnabled()) {
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
@ -575,13 +573,13 @@ public class ServerManager {
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287 // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
this.queuedDeadServers.add(serverName); this.queuedDeadServers.add(serverName);
return false; return;
} }
if (this.deadservers.isDeadServer(serverName)) { if (this.deadservers.isDeadServer(serverName)) {
// TODO: Can this happen? It shouldn't be online in this case? // TODO: Can this happen? It shouldn't be online in this case?
LOG.warn("Expiration of " + serverName + LOG.warn("Expiration of " + serverName +
" but server shutdown already in progress"); " but server shutdown already in progress");
return false; return;
} }
moveFromOnlineToDeadServers(serverName); moveFromOnlineToDeadServers(serverName);
@ -593,7 +591,7 @@ public class ServerManager {
if (this.onlineServers.isEmpty()) { if (this.onlineServers.isEmpty()) {
master.stop("Cluster shutdown set; onlineServer=0"); master.stop("Cluster shutdown set; onlineServer=0");
} }
return false; return;
} }
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName()); LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
master.getAssignmentManager().submitServerCrash(serverName, true); master.getAssignmentManager().submitServerCrash(serverName, true);
@ -604,7 +602,6 @@ public class ServerManager {
listener.serverRemoved(serverName); listener.serverRemoved(serverName);
} }
} }
return true;
} }
@VisibleForTesting @VisibleForTesting

View File

@ -21,17 +21,12 @@ import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.yetus.audience.InterfaceAudience; import org.apache.yetus.audience.InterfaceAudience;
/** /**
* Used internally signaling failed queue of a remote procedure operation. * Used internally signaling failed queue of a remote procedure
* Usually happens because no such remote server; it is being processed as crashed so it is not * operation.
* online at time of RPC. Otherwise, something unexpected happened.
*/ */
@SuppressWarnings("serial") @SuppressWarnings("serial")
@InterfaceAudience.Private @InterfaceAudience.Private
public class FailedRemoteDispatchException extends HBaseIOException { public class FailedRemoteDispatchException extends HBaseIOException {
public FailedRemoteDispatchException() {
super();
}
public FailedRemoteDispatchException(String msg) { public FailedRemoteDispatchException(String msg) {
super(msg); super(msg);
} }

View File

@ -177,8 +177,10 @@ public abstract class RegionTransitionProcedure
public void remoteCallFailed(final MasterProcedureEnv env, public void remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) { final ServerName serverName, final IOException exception) {
final RegionStateNode regionNode = getRegionState(env); final RegionStateNode regionNode = getRegionState(env);
LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(), String msg = exception.getMessage() == null? exception.getClass().getSimpleName():
exception.toString()); exception.getMessage();
LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() +
"; exception=" + msg);
if (remoteCallFailed(env, regionNode, exception)) { if (remoteCallFailed(env, regionNode, exception)) {
// NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
@ -213,14 +215,9 @@ public abstract class RegionTransitionProcedure
// backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and // backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and
// ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does // ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does
// wake to undo the above suspend. // wake to undo the above suspend.
//
// We fail the addOperationToNode usually because there is no such remote server (it has
// crashed and we are currently processing it or something went badly wrong and we have a
// bad server).
if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) { if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) {
remoteCallFailed(env, targetServer, targetServer == null? remoteCallFailed(env, targetServer,
new FailedRemoteDispatchException(): new FailedRemoteDispatchException(this + " to " + targetServer));
new FailedRemoteDispatchException(targetServer.toShortString()));
return false; return false;
} }
return true; return true;

View File

@ -249,12 +249,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
final IOException exception) { final IOException exception) {
// TODO: Is there on-going rpc to cleanup? // TODO: Is there on-going rpc to cleanup?
if (exception instanceof ServerCrashException) { if (exception instanceof ServerCrashException) {
// This exception comes from ServerCrashProcedure after it is done with log splitting. // This exception comes from ServerCrashProcedure after log splitting.
// SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to // SCP found this region as a RIT. Its call into here says it is ok to let this procedure go
// let this procedure go on to a complete close now. This will release lock on this region so // on to a complete close now. This will release lock on this region so subsequent action on
// subsequent action on region can succeed; e.g. the assign that follows this unassign when // region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP
// a move (w/o wait on SCP the assign could run w/o logs being split so data loss). // the assign could run w/o logs being split so data loss).
reportTransitionCLOSED(env, regionNode); try {
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
} catch (UnexpectedStateException e) {
// Should never happen.
throw new RuntimeException(e);
}
} else if (exception instanceof RegionServerAbortedException || } else if (exception instanceof RegionServerAbortedException ||
exception instanceof RegionServerStoppedException || exception instanceof RegionServerStoppedException ||
exception instanceof ServerNotRunningYetException) { exception instanceof ServerNotRunningYetException) {
@ -268,33 +273,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
exception); exception);
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH); setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
} else { } else {
LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(), LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() +
exception.toString()); ", exception=" + exception);
if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) { env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation());
// Return false so this procedure stays in suspended state. It will be woken up by // Return false so this procedure stays in suspended state. It will be woken up by a
// ServerCrashProcedure when it notices this RIT and calls this method again but with // ServerCrashProcedure when it notices this RIT.
// a SCPException -- see above.
// TODO: Add a SCP as a new subprocedure that we now come to depend on. // TODO: Add a SCP as a new subprocedure that we now come to depend on.
return false; return false;
} else {
LOG.warn("Failed expire of {}; presumed CRASHED; moving region to CLOSED state",
regionNode.getRegionLocation());
reportTransitionCLOSED(env, regionNode);
}
} }
return true; return true;
} }
private void reportTransitionCLOSED(final MasterProcedureEnv env,
final RegionStateNode regionNode) {
try {
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
} catch (UnexpectedStateException e) {
// Should never happen.
throw new RuntimeException(e);
}
}
@Override @Override
public void toStringClassDetails(StringBuilder sb) { public void toStringClassDetails(StringBuilder sb) {
super.toStringClassDetails(sb); super.toStringClassDetails(sb);

View File

@ -712,7 +712,7 @@ public class TestAssignmentManager {
} }
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor { private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
public static final int TYPES_OF_FAILURE = 7; public static final int TYPES_OF_FAILURE = 6;
private int invocations; private int invocations;
@Override @Override
@ -724,14 +724,6 @@ public class TestAssignmentManager {
case 2: throw new RegionServerStoppedException("Fake!"); case 2: throw new RegionServerStoppedException("Fake!");
case 3: throw new ServerNotRunningYetException("Fake!"); case 3: throw new ServerNotRunningYetException("Fake!");
case 4: case 4:
// We will expire the server that we failed to rpc against.
throw new FailedRemoteDispatchException("Fake!");
case 5:
// Mark this regionserver as already expiring so we go different code route; i.e. we
// FAIL to expire the remote server and presume ok to move region to CLOSED. HBASE-20137.
TestAssignmentManager.this.master.getServerManager().expireServer(server);
throw new FailedRemoteDispatchException("Fake!");
case 6:
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout"); LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
executor.schedule(new Runnable() { executor.schedule(new Runnable() {
@Override @Override