diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index e2f0b6b3092..06d6c8b7279 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -555,17 +555,15 @@ public class ServerManager { } /* - * Expire the passed server. Add it to list of dead servers and queue a shutdown processing. - * @return True if we expired passed serverName else false if we failed to schedule - * an expire (and attendant ServerCrashProcedure -- some clients are dependent on - * server crash procedure being queued and need to know if has not been queued). + * Expire the passed server. Add it to list of dead servers and queue a + * shutdown processing. */ - public synchronized boolean expireServer(final ServerName serverName) { + public synchronized void expireServer(final ServerName serverName) { if (serverName.equals(master.getServerName())) { if (!(master.isAborted() || master.isStopped())) { master.stop("We lost our znode?"); } - return false; + return; } if (!master.isServerCrashProcessingEnabled()) { LOG.info("Master doesn't enable ServerShutdownHandler during initialization, " @@ -575,13 +573,13 @@ public class ServerManager { // the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287 master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName); this.queuedDeadServers.add(serverName); - return false; + return; } if (this.deadservers.isDeadServer(serverName)) { // TODO: Can this happen? It shouldn't be online in this case? LOG.warn("Expiration of " + serverName + " but server shutdown already in progress"); - return false; + return; } moveFromOnlineToDeadServers(serverName); @@ -593,7 +591,7 @@ public class ServerManager { if (this.onlineServers.isEmpty()) { master.stop("Cluster shutdown set; onlineServer=0"); } - return false; + return; } LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName()); master.getAssignmentManager().submitServerCrash(serverName, true); @@ -604,7 +602,6 @@ public class ServerManager { listener.serverRemoved(serverName); } } - return true; } @VisibleForTesting diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java index 7e98675a0b9..b459cfe43b0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/FailedRemoteDispatchException.java @@ -21,17 +21,12 @@ import org.apache.hadoop.hbase.HBaseIOException; import org.apache.yetus.audience.InterfaceAudience; /** - * Used internally signaling failed queue of a remote procedure operation. - * Usually happens because no such remote server; it is being processed as crashed so it is not - * online at time of RPC. Otherwise, something unexpected happened. + * Used internally signaling failed queue of a remote procedure + * operation. */ @SuppressWarnings("serial") @InterfaceAudience.Private public class FailedRemoteDispatchException extends HBaseIOException { - public FailedRemoteDispatchException() { - super(); - } - public FailedRemoteDispatchException(String msg) { super(msg); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java index d427c58e608..6c63cb83be3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionTransitionProcedure.java @@ -182,8 +182,10 @@ public abstract class RegionTransitionProcedure public void remoteCallFailed(final MasterProcedureEnv env, final ServerName serverName, final IOException exception) { final RegionStateNode regionNode = getRegionState(env); - LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(), - exception.toString()); + String msg = exception.getMessage() == null? exception.getClass().getSimpleName(): + exception.getMessage(); + LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() + + "; exception=" + msg); if (remoteCallFailed(env, regionNode, exception)) { // NOTE: This call to wakeEvent puts this Procedure back on the scheduler. // Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond @@ -218,14 +220,9 @@ public abstract class RegionTransitionProcedure // backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and // ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does // wake to undo the above suspend. - // - // We fail the addOperationToNode usually because there is no such remote server (it has - // crashed and we are currently processing it or something went badly wrong and we have a - // bad server). if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) { - remoteCallFailed(env, targetServer, targetServer == null? - new FailedRemoteDispatchException(): - new FailedRemoteDispatchException(targetServer.toShortString())); + remoteCallFailed(env, targetServer, + new FailedRemoteDispatchException(this + " to " + targetServer)); return false; } return true; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java index fd3d78d475e..3454d96487f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/UnassignProcedure.java @@ -249,12 +249,17 @@ public class UnassignProcedure extends RegionTransitionProcedure { final IOException exception) { // TODO: Is there on-going rpc to cleanup? if (exception instanceof ServerCrashException) { - // This exception comes from ServerCrashProcedure after it is done with log splitting. - // SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to - // let this procedure go on to a complete close now. This will release lock on this region so - // subsequent action on region can succeed; e.g. the assign that follows this unassign when - // a move (w/o wait on SCP the assign could run w/o logs being split so data loss). - reportTransitionCLOSED(env, regionNode); + // This exception comes from ServerCrashProcedure after log splitting. + // SCP found this region as a RIT. Its call into here says it is ok to let this procedure go + // on to a complete close now. This will release lock on this region so subsequent action on + // region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP + // the assign could run w/o logs being split so data loss). + try { + reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM); + } catch (UnexpectedStateException e) { + // Should never happen. + throw new RuntimeException(e); + } } else if (exception instanceof RegionServerAbortedException || exception instanceof RegionServerStoppedException || exception instanceof ServerNotRunningYetException) { @@ -268,33 +273,17 @@ public class UnassignProcedure extends RegionTransitionProcedure { exception); setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH); } else { - LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(), - exception.toString()); - if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) { - // Return false so this procedure stays in suspended state. It will be woken up by - // ServerCrashProcedure when it notices this RIT and calls this method again but with - // a SCPException -- see above. - // TODO: Add a SCP as a new subprocedure that we now come to depend on. - return false; - } else { - LOG.warn("Failed expire of {}; presumed CRASHED; moving region to CLOSED state", - regionNode.getRegionLocation()); - reportTransitionCLOSED(env, regionNode); - } + LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() + + ", exception=" + exception); + env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation()); + // Return false so this procedure stays in suspended state. It will be woken up by a + // ServerCrashProcedure when it notices this RIT. + // TODO: Add a SCP as a new subprocedure that we now come to depend on. + return false; } return true; } - private void reportTransitionCLOSED(final MasterProcedureEnv env, - final RegionStateNode regionNode) { - try { - reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM); - } catch (UnexpectedStateException e) { - // Should never happen. - throw new RuntimeException(e); - } - } - @Override public void toStringClassDetails(StringBuilder sb) { super.toStringClassDetails(sb); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java index c5eb9ec5a0f..5003d51e432 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java @@ -722,7 +722,7 @@ public class TestAssignmentManager { } private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor { - public static final int TYPES_OF_FAILURE = 7; + public static final int TYPES_OF_FAILURE = 6; private int invocations; @Override @@ -734,14 +734,6 @@ public class TestAssignmentManager { case 2: throw new RegionServerStoppedException("Fake!"); case 3: throw new ServerNotRunningYetException("Fake!"); case 4: - // We will expire the server that we failed to rpc against. - throw new FailedRemoteDispatchException("Fake!"); - case 5: - // Mark this regionserver as already expiring so we go different code route; i.e. we - // FAIL to expire the remote server and presume ok to move region to CLOSED. HBASE-20137. - TestAssignmentManager.this.master.getServerManager().expireServer(server); - throw new FailedRemoteDispatchException("Fake!"); - case 6: LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout"); executor.schedule(new Runnable() { @Override