Revert "HBASE-20137 TestRSGroups is flakey"
Revert. Fix is not right.
This reverts commit 6d1740d498.
This commit is contained in:
parent
6b77786dfc
commit
37d91cdfbb
|
@ -555,17 +555,15 @@ public class ServerManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Expire the passed server. Add it to list of dead servers and queue a shutdown processing.
|
* Expire the passed server. Add it to list of dead servers and queue a
|
||||||
* @return True if we expired passed <code>serverName</code> else false if we failed to schedule
|
* shutdown processing.
|
||||||
* an expire (and attendant ServerCrashProcedure -- some clients are dependent on
|
|
||||||
* server crash procedure being queued and need to know if has not been queued).
|
|
||||||
*/
|
*/
|
||||||
public synchronized boolean expireServer(final ServerName serverName) {
|
public synchronized void expireServer(final ServerName serverName) {
|
||||||
if (serverName.equals(master.getServerName())) {
|
if (serverName.equals(master.getServerName())) {
|
||||||
if (!(master.isAborted() || master.isStopped())) {
|
if (!(master.isAborted() || master.isStopped())) {
|
||||||
master.stop("We lost our znode?");
|
master.stop("We lost our znode?");
|
||||||
}
|
}
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
if (!master.isServerCrashProcessingEnabled()) {
|
if (!master.isServerCrashProcessingEnabled()) {
|
||||||
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
|
||||||
|
@ -575,13 +573,13 @@ public class ServerManager {
|
||||||
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
|
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
|
||||||
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
|
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
|
||||||
this.queuedDeadServers.add(serverName);
|
this.queuedDeadServers.add(serverName);
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
if (this.deadservers.isDeadServer(serverName)) {
|
if (this.deadservers.isDeadServer(serverName)) {
|
||||||
// TODO: Can this happen? It shouldn't be online in this case?
|
// TODO: Can this happen? It shouldn't be online in this case?
|
||||||
LOG.warn("Expiration of " + serverName +
|
LOG.warn("Expiration of " + serverName +
|
||||||
" but server shutdown already in progress");
|
" but server shutdown already in progress");
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
moveFromOnlineToDeadServers(serverName);
|
moveFromOnlineToDeadServers(serverName);
|
||||||
|
|
||||||
|
@ -593,7 +591,7 @@ public class ServerManager {
|
||||||
if (this.onlineServers.isEmpty()) {
|
if (this.onlineServers.isEmpty()) {
|
||||||
master.stop("Cluster shutdown set; onlineServer=0");
|
master.stop("Cluster shutdown set; onlineServer=0");
|
||||||
}
|
}
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
|
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
|
||||||
master.getAssignmentManager().submitServerCrash(serverName, true);
|
master.getAssignmentManager().submitServerCrash(serverName, true);
|
||||||
|
@ -604,7 +602,6 @@ public class ServerManager {
|
||||||
listener.serverRemoved(serverName);
|
listener.serverRemoved(serverName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
|
|
|
@ -21,17 +21,12 @@ import org.apache.hadoop.hbase.HBaseIOException;
|
||||||
import org.apache.yetus.audience.InterfaceAudience;
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Used internally signaling failed queue of a remote procedure operation.
|
* Used internally signaling failed queue of a remote procedure
|
||||||
* Usually happens because no such remote server; it is being processed as crashed so it is not
|
* operation.
|
||||||
* online at time of RPC. Otherwise, something unexpected happened.
|
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("serial")
|
@SuppressWarnings("serial")
|
||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
public class FailedRemoteDispatchException extends HBaseIOException {
|
public class FailedRemoteDispatchException extends HBaseIOException {
|
||||||
public FailedRemoteDispatchException() {
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public FailedRemoteDispatchException(String msg) {
|
public FailedRemoteDispatchException(String msg) {
|
||||||
super(msg);
|
super(msg);
|
||||||
}
|
}
|
||||||
|
|
|
@ -177,8 +177,10 @@ public abstract class RegionTransitionProcedure
|
||||||
public void remoteCallFailed(final MasterProcedureEnv env,
|
public void remoteCallFailed(final MasterProcedureEnv env,
|
||||||
final ServerName serverName, final IOException exception) {
|
final ServerName serverName, final IOException exception) {
|
||||||
final RegionStateNode regionNode = getRegionState(env);
|
final RegionStateNode regionNode = getRegionState(env);
|
||||||
LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(),
|
String msg = exception.getMessage() == null? exception.getClass().getSimpleName():
|
||||||
exception.toString());
|
exception.getMessage();
|
||||||
|
LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() +
|
||||||
|
"; exception=" + msg);
|
||||||
if (remoteCallFailed(env, regionNode, exception)) {
|
if (remoteCallFailed(env, regionNode, exception)) {
|
||||||
// NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
|
// NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
|
||||||
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
|
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
|
||||||
|
@ -213,14 +215,9 @@ public abstract class RegionTransitionProcedure
|
||||||
// backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and
|
// backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and
|
||||||
// ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does
|
// ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does
|
||||||
// wake to undo the above suspend.
|
// wake to undo the above suspend.
|
||||||
//
|
|
||||||
// We fail the addOperationToNode usually because there is no such remote server (it has
|
|
||||||
// crashed and we are currently processing it or something went badly wrong and we have a
|
|
||||||
// bad server).
|
|
||||||
if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) {
|
if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) {
|
||||||
remoteCallFailed(env, targetServer, targetServer == null?
|
remoteCallFailed(env, targetServer,
|
||||||
new FailedRemoteDispatchException():
|
new FailedRemoteDispatchException(this + " to " + targetServer));
|
||||||
new FailedRemoteDispatchException(targetServer.toShortString()));
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -249,12 +249,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
||||||
final IOException exception) {
|
final IOException exception) {
|
||||||
// TODO: Is there on-going rpc to cleanup?
|
// TODO: Is there on-going rpc to cleanup?
|
||||||
if (exception instanceof ServerCrashException) {
|
if (exception instanceof ServerCrashException) {
|
||||||
// This exception comes from ServerCrashProcedure after it is done with log splitting.
|
// This exception comes from ServerCrashProcedure after log splitting.
|
||||||
// SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to
|
// SCP found this region as a RIT. Its call into here says it is ok to let this procedure go
|
||||||
// let this procedure go on to a complete close now. This will release lock on this region so
|
// on to a complete close now. This will release lock on this region so subsequent action on
|
||||||
// subsequent action on region can succeed; e.g. the assign that follows this unassign when
|
// region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP
|
||||||
// a move (w/o wait on SCP the assign could run w/o logs being split so data loss).
|
// the assign could run w/o logs being split so data loss).
|
||||||
reportTransitionCLOSED(env, regionNode);
|
try {
|
||||||
|
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
|
||||||
|
} catch (UnexpectedStateException e) {
|
||||||
|
// Should never happen.
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
} else if (exception instanceof RegionServerAbortedException ||
|
} else if (exception instanceof RegionServerAbortedException ||
|
||||||
exception instanceof RegionServerStoppedException ||
|
exception instanceof RegionServerStoppedException ||
|
||||||
exception instanceof ServerNotRunningYetException) {
|
exception instanceof ServerNotRunningYetException) {
|
||||||
|
@ -268,33 +273,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
|
||||||
exception);
|
exception);
|
||||||
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
|
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
|
||||||
} else {
|
} else {
|
||||||
LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(),
|
LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() +
|
||||||
exception.toString());
|
", exception=" + exception);
|
||||||
if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) {
|
env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation());
|
||||||
// Return false so this procedure stays in suspended state. It will be woken up by
|
// Return false so this procedure stays in suspended state. It will be woken up by a
|
||||||
// ServerCrashProcedure when it notices this RIT and calls this method again but with
|
// ServerCrashProcedure when it notices this RIT.
|
||||||
// a SCPException -- see above.
|
|
||||||
// TODO: Add a SCP as a new subprocedure that we now come to depend on.
|
// TODO: Add a SCP as a new subprocedure that we now come to depend on.
|
||||||
return false;
|
return false;
|
||||||
} else {
|
|
||||||
LOG.warn("Failed expire of {}; presumed CRASHED; moving region to CLOSED state",
|
|
||||||
regionNode.getRegionLocation());
|
|
||||||
reportTransitionCLOSED(env, regionNode);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void reportTransitionCLOSED(final MasterProcedureEnv env,
|
|
||||||
final RegionStateNode regionNode) {
|
|
||||||
try {
|
|
||||||
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
|
|
||||||
} catch (UnexpectedStateException e) {
|
|
||||||
// Should never happen.
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void toStringClassDetails(StringBuilder sb) {
|
public void toStringClassDetails(StringBuilder sb) {
|
||||||
super.toStringClassDetails(sb);
|
super.toStringClassDetails(sb);
|
||||||
|
|
|
@ -712,7 +712,7 @@ public class TestAssignmentManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
|
||||||
public static final int TYPES_OF_FAILURE = 7;
|
public static final int TYPES_OF_FAILURE = 6;
|
||||||
private int invocations;
|
private int invocations;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -724,14 +724,6 @@ public class TestAssignmentManager {
|
||||||
case 2: throw new RegionServerStoppedException("Fake!");
|
case 2: throw new RegionServerStoppedException("Fake!");
|
||||||
case 3: throw new ServerNotRunningYetException("Fake!");
|
case 3: throw new ServerNotRunningYetException("Fake!");
|
||||||
case 4:
|
case 4:
|
||||||
// We will expire the server that we failed to rpc against.
|
|
||||||
throw new FailedRemoteDispatchException("Fake!");
|
|
||||||
case 5:
|
|
||||||
// Mark this regionserver as already expiring so we go different code route; i.e. we
|
|
||||||
// FAIL to expire the remote server and presume ok to move region to CLOSED. HBASE-20137.
|
|
||||||
TestAssignmentManager.this.master.getServerManager().expireServer(server);
|
|
||||||
throw new FailedRemoteDispatchException("Fake!");
|
|
||||||
case 6:
|
|
||||||
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
|
||||||
executor.schedule(new Runnable() {
|
executor.schedule(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
|
|
Loading…
Reference in New Issue