diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 06d6c8b7279..e2f0b6b3092 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -555,15 +555,17 @@ public class ServerManager { } /* - * Expire the passed server. Add it to list of dead servers and queue a - * shutdown processing. + * Expire the passed server. Add it to list of dead servers and queue a shutdown processing. + * @return True if we expired passed serverName else false if we failed to schedule + * an expire (and attendant ServerCrashProcedure -- some clients are dependent on + * server crash procedure being queued and need to know if has not been queued). */ - public synchronized void expireServer(final ServerName serverName) { + public synchronized boolean expireServer(final ServerName serverName) { if (serverName.equals(master.getServerName())) { if (!(master.isAborted() || master.isStopped())) { master.stop("We lost our znode?"); } - return; + return false; } if (!master.isServerCrashProcessingEnabled()) { LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " @@ -573,13 +575,13 @@ public class ServerManager { // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287 master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); this.queuedDeadServers.add(serverName); - return; + return false; } if (this.deadservers.isDeadServer(serverName)) { // TODO: Can this happen? It shouldn't be online in this case? LOG.warn("Expiration of " + serverName + " but server shutdown already in progress"); - return; + return false; } moveFromOnlineToDeadServers(serverName); @@ -591,7 +593,7 @@ public class ServerManager { if (this.onlineServers.isEmpty()) { master.stop("Cluster shutdown set; onlineServer=0"); } - return; + return false; } LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName()); master.getAssignmentManager().submitServerCrash(serverName, true); @@ -602,6 +604,7 @@ public class ServerManager { listener.serverRemoved(serverName); } } + return true; } @VisibleForTesting diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java index b459cfe43b0..7e98675a0b9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java @@ -21,12 +21,17 @@ import org.apache.hadoop.hbase.HBaseIOException; import org.apache.yetus.audience.InterfaceAudience; /** - * Used internally signaling failed queue of a remote procedure - * operation. + * Used internally signaling failed queue of a remote procedure operation. + * Usually happens because no such remote server; it is being processed as crashed so it is not + * online at time of RPC. Otherwise, something unexpected happened. */ @SuppressWarnings("serial") @InterfaceAudience.Private public class FailedRemoteDispatchException extends HBaseIOException { + public FailedRemoteDispatchException() { + super(); + } + public FailedRemoteDispatchException(String msg) { super(msg); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index 6c63cb83be3..d427c58e608 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -182,10 +182,8 @@ public abstract class RegionTransitionProcedure public void remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); - String msg = exception.getMessage() == null? exception.getClass().getSimpleName(): - exception.getMessage(); - LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() + - "; exception=" + msg); + LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(), + exception.toString()); if (remoteCallFailed(env, regionNode, exception)) { // NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond @@ -220,9 +218,14 @@ public abstract class RegionTransitionProcedure // backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and // ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does // wake to undo the above suspend. + // + // We fail the addOperationToNode usually because there is no such remote server (it has + // crashed and we are currently processing it or something went badly wrong and we have a + // bad server). if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) { - remoteCallFailed(env, targetServer, - new FailedRemoteDispatchException(this + " to " + targetServer)); + remoteCallFailed(env, targetServer, targetServer == null? + new FailedRemoteDispatchException(): + new FailedRemoteDispatchException(targetServer.toShortString())); return false; } return true; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index 3454d96487f..fd3d78d475e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -249,17 +249,12 @@ public class UnassignProcedure extends RegionTransitionProcedure { final IOException exception) { // TODO: Is there on-going rpc to cleanup? if (exception instanceof ServerCrashException) { - // This exception comes from ServerCrashProcedure after log splitting. - // SCP found this region as a RIT. Its call into here says it is ok to let this procedure go - // on to a complete close now. This will release lock on this region so subsequent action on - // region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP - // the assign could run w/o logs being split so data loss). - try { - reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM); - } catch (UnexpectedStateException e) { - // Should never happen. - throw new RuntimeException(e); - } + // This exception comes from ServerCrashProcedure after it is done with log splitting. + // SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to + // let this procedure go on to a complete close now. This will release lock on this region so + // subsequent action on region can succeed; e.g. the assign that follows this unassign when + // a move (w/o wait on SCP the assign could run w/o logs being split so data loss). + reportTransitionCLOSED(env, regionNode); } else if (exception instanceof RegionServerAbortedException || exception instanceof RegionServerStoppedException || exception instanceof ServerNotRunningYetException) { @@ -273,17 +268,33 @@ public class UnassignProcedure extends RegionTransitionProcedure { exception); setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH); } else { - LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() + - ", exception=" + exception); - env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation()); - // Return false so this procedure stays in suspended state. It will be woken up by a - // ServerCrashProcedure when it notices this RIT. - // TODO: Add a SCP as a new subprocedure that we now come to depend on. - return false; + LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(), + exception.toString()); + if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) { + // Return false so this procedure stays in suspended state. It will be woken up by + // ServerCrashProcedure when it notices this RIT and calls this method again but with + // a SCPException -- see above. + // TODO: Add a SCP as a new subprocedure that we now come to depend on. + return false; + } else { + LOG.warn("Failed expire of {}; presumed CRASHED; moving region to CLOSED state", + regionNode.getRegionLocation()); + reportTransitionCLOSED(env, regionNode); + } } return true; } + private void reportTransitionCLOSED(final MasterProcedureEnv env, + final RegionStateNode regionNode) { + try { + reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM); + } catch (UnexpectedStateException e) { + // Should never happen. + throw new RuntimeException(e); + } + } + @Override public void toStringClassDetails(StringBuilder sb) { super.toStringClassDetails(sb); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java index 5003d51e432..c5eb9ec5a0f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java @@ -722,7 +722,7 @@ public class TestAssignmentManager { } private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor { - public static final int TYPES_OF_FAILURE = 6; + public static final int TYPES_OF_FAILURE = 7; private int invocations; @Override @@ -734,6 +734,14 @@ public class TestAssignmentManager { case 2: throw new RegionServerStoppedException("Fake!"); case 3: throw new ServerNotRunningYetException("Fake!"); case 4: + // We will expire the server that we failed to rpc against. + throw new FailedRemoteDispatchException("Fake!"); + case 5: + // Mark this regionserver as already expiring so we go different code route; i.e. we + // FAIL to expire the remote server and presume ok to move region to CLOSED. HBASE-20137. + TestAssignmentManager.this.master.getServerManager().expireServer(server); + throw new FailedRemoteDispatchException("Fake!"); + case 6: LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout"); executor.schedule(new Runnable() { @Override