Revert "HBASE-20137 TestRSGroups is flakey"

Revert. Fix is not right.

This reverts commit 6d1740d498.
Michael Stack 2018-03-07 09:24:09 -08:00
parent 6b77786dfc
commit 37d91cdfbb
5 changed files with 34 additions and 64 deletions
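In effect, the revert takes ServerManager#expireServer back to a void, fire-and-forget method and drops the boolean "did we actually schedule an expire and its ServerCrashProcedure" signal that HBASE-20137 had introduced, together with the call-site handling of that signal in UnassignProcedure. A hypothetical, self-contained sketch of the behavioral difference (simplified stand-in code, not HBase source; class and method names here are invented for illustration):

import java.util.HashSet;
import java.util.Set;

public class ExpireServerSketch {
  // Stand-in for the master's record of servers already being processed as dead.
  private final Set<String> deadServers = new HashSet<>();

  // HBASE-20137 shape (removed by this revert): tell the caller whether an expire,
  // and hence crash processing, was actually queued.
  public synchronized boolean expireServerReturningStatus(String serverName) {
    if (deadServers.contains(serverName)) {
      return false;               // shutdown already in progress; nothing new queued
    }
    deadServers.add(serverName);  // stand-in for queuing the crash handling
    return true;
  }

  // Pre-HBASE-20137 shape (restored by this revert): fire-and-forget, no feedback.
  public synchronized void expireServer(String serverName) {
    if (deadServers.contains(serverName)) {
      return;
    }
    deadServers.add(serverName);
  }

  public static void main(String[] args) {
    ExpireServerSketch sm = new ExpireServerSketch();
    System.out.println(sm.expireServerReturningStatus("rs1,16020,1")); // true: expire queued
    System.out.println(sm.expireServerReturningStatus("rs1,16020,1")); // false: already dead
    sm.expireServer("rs2,16020,1"); // void form: caller cannot tell either way
  }
}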

View File

@@ -555,17 +555,15 @@ public class ServerManager {
}
/*
* Expire the passed server. Add it to list of dead servers and queue a shutdown processing.
* @return True if we expired passed <code>serverName</code> else false if we failed to schedule
* an expire (and attendant ServerCrashProcedure -- some clients are dependent on
* server crash procedure being queued and need to know if has not been queued).
* Expire the passed server. Add it to list of dead servers and queue a
* shutdown processing.
*/
public synchronized boolean expireServer(final ServerName serverName) {
public synchronized void expireServer(final ServerName serverName) {
if (serverName.equals(master.getServerName())) {
if (!(master.isAborted() || master.isStopped())) {
master.stop("We lost our znode?");
}
return false;
return;
}
if (!master.isServerCrashProcessingEnabled()) {
LOG.info("Master doesn't enable ServerShutdownHandler during initialization, "
@@ -575,13 +573,13 @@ public class ServerManager {
// the SCP is not enable yet and Meta's RIT may be suspend forever. See HBase-19287
master.getAssignmentManager().handleMetaRITOnCrashedServer(serverName);
this.queuedDeadServers.add(serverName);
return false;
return;
}
if (this.deadservers.isDeadServer(serverName)) {
// TODO: Can this happen? It shouldn't be online in this case?
LOG.warn("Expiration of " + serverName +
" but server shutdown already in progress");
return false;
return;
}
moveFromOnlineToDeadServers(serverName);
@@ -593,7 +591,7 @@ public class ServerManager {
if (this.onlineServers.isEmpty()) {
master.stop("Cluster shutdown set; onlineServer=0");
}
return false;
return;
}
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
master.getAssignmentManager().submitServerCrash(serverName, true);
@@ -604,7 +602,6 @@ public class ServerManager {
listener.serverRemoved(serverName);
}
}
return true;
}
@VisibleForTesting

View File

@@ -21,17 +21,12 @@ import org.apache.hadoop.hbase.HBaseIOException;
import org.apache.yetus.audience.InterfaceAudience;
/**
* Used internally signaling failed queue of a remote procedure operation.
* Usually happens because no such remote server; it is being processed as crashed so it is not
* online at time of RPC. Otherwise, something unexpected happened.
* Used internally signaling failed queue of a remote procedure
* operation.
*/
@SuppressWarnings("serial")
@InterfaceAudience.Private
public class FailedRemoteDispatchException extends HBaseIOException {
public FailedRemoteDispatchException() {
super();
}
public FailedRemoteDispatchException(String msg) {
super(msg);
}
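After the revert, FailedRemoteDispatchException is back to a single message-taking constructor; the no-argument constructor the reverted change used for the targetServer == null dispatch case (see the RegionTransitionProcedure hunk below) disappears. A condensed, standalone sketch of the post-revert shape, with an invented stand-in parent so it compiles without HBase on the classpath:

import java.io.IOException;

// Stand-in for org.apache.hadoop.hbase.HBaseIOException, only so this sketch is self-contained.
class HBaseIOExceptionStandIn extends IOException {
  HBaseIOExceptionStandIn(String msg) {
    super(msg);
  }
}

// Post-revert shape: a dispatch failure always carries a message describing the failed
// remote operation.
class FailedRemoteDispatchExceptionSketch extends HBaseIOExceptionStandIn {
  FailedRemoteDispatchExceptionSketch(String msg) {
    super(msg);
  }

  public static void main(String[] args) {
    // Message format mirrors the restored RegionTransitionProcedure call site below:
    // procedure + " to " + targetServer.
    System.out.println(new FailedRemoteDispatchExceptionSketch(
        "UnassignProcedure pid=123 to rs1,16020,1").getMessage());
  }
}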

View File

@@ -177,8 +177,10 @@ public abstract class RegionTransitionProcedure
public void remoteCallFailed(final MasterProcedureEnv env,
final ServerName serverName, final IOException exception) {
final RegionStateNode regionNode = getRegionState(env);
LOG.warn("Remote call failed {}; rit={}, exception={}", this, regionNode.getState(),
exception.toString());
String msg = exception.getMessage() == null? exception.getClass().getSimpleName():
exception.getMessage();
LOG.warn("Remote call failed " + this + "; " + regionNode.toShortString() +
"; exception=" + msg);
if (remoteCallFailed(env, regionNode, exception)) {
// NOTE: This call to wakeEvent puts this Procedure back on the scheduler.
// Thereafter, another Worker can be in here so DO NOT MESS WITH STATE beyond
@@ -213,14 +215,9 @@ public abstract class RegionTransitionProcedure
// backtrack on stuff like the 'suspend' done above -- tricky as the 'wake' requests us -- and
// ditto up in the caller; it needs to undo state changes. Inside in remoteCallFailed, it does
// wake to undo the above suspend.
//
// We fail the addOperationToNode usually because there is no such remote server (it has
// crashed and we are currently processing it or something went badly wrong and we have a
// bad server).
if (!env.getRemoteDispatcher().addOperationToNode(targetServer, this)) {
remoteCallFailed(env, targetServer, targetServer == null?
new FailedRemoteDispatchException():
new FailedRemoteDispatchException(targetServer.toShortString()));
remoteCallFailed(env, targetServer,
new FailedRemoteDispatchException(this + " to " + targetServer));
return false;
}
return true;

View File

@@ -249,12 +249,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
final IOException exception) {
// TODO: Is there on-going rpc to cleanup?
if (exception instanceof ServerCrashException) {
// This exception comes from ServerCrashProcedure after it is done with log splitting.
// SCP found this region as a Region-In-Transition (RIT). Its call into here says it is ok to
// let this procedure go on to a complete close now. This will release lock on this region so
// subsequent action on region can succeed; e.g. the assign that follows this unassign when
// a move (w/o wait on SCP the assign could run w/o logs being split so data loss).
reportTransitionCLOSED(env, regionNode);
// This exception comes from ServerCrashProcedure after log splitting.
// SCP found this region as a RIT. Its call into here says it is ok to let this procedure go
// on to a complete close now. This will release lock on this region so subsequent action on
// region can succeed; e.g. the assign that follows this unassign when a move (w/o wait on SCP
// the assign could run w/o logs being split so data loss).
try {
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
} catch (UnexpectedStateException e) {
// Should never happen.
throw new RuntimeException(e);
}
} else if (exception instanceof RegionServerAbortedException ||
exception instanceof RegionServerStoppedException ||
exception instanceof ServerNotRunningYetException) {
@@ -268,33 +273,17 @@ public class UnassignProcedure extends RegionTransitionProcedure {
exception);
setTransitionState(RegionTransitionState.REGION_TRANSITION_FINISH);
} else {
LOG.warn("Expiring server {}; rit={}, exception={}", this, regionNode.getState(),
exception.toString());
if (env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation())) {
// Return false so this procedure stays in suspended state. It will be woken up by
// ServerCrashProcedure when it notices this RIT and calls this method again but with
// a SCPException -- see above.
// TODO: Add a SCP as a new subprocedure that we now come to depend on.
return false;
} else {
LOG.warn("Failed expire of {}; presumed CRASHED; moving region to CLOSED state",
regionNode.getRegionLocation());
reportTransitionCLOSED(env, regionNode);
}
LOG.warn("Expiring server " + this + "; " + regionNode.toShortString() +
", exception=" + exception);
env.getMasterServices().getServerManager().expireServer(regionNode.getRegionLocation());
// Return false so this procedure stays in suspended state. It will be woken up by a
// ServerCrashProcedure when it notices this RIT.
// TODO: Add a SCP as a new subprocedure that we now come to depend on.
return false;
}
return true;
}
private void reportTransitionCLOSED(final MasterProcedureEnv env,
final RegionStateNode regionNode) {
try {
reportTransition(env, regionNode, TransitionCode.CLOSED, HConstants.NO_SEQNUM);
} catch (UnexpectedStateException e) {
// Should never happen.
throw new RuntimeException(e);
}
}
@Override
public void toStringClassDetails(StringBuilder sb) {
super.toStringClassDetails(sb);

View File

@@ -712,7 +712,7 @@ public class TestAssignmentManager {
}
private class HangOnCloseThenRSCrashExecutor extends GoodRsExecutor {
public static final int TYPES_OF_FAILURE = 7;
public static final int TYPES_OF_FAILURE = 6;
private int invocations;
@Override
@@ -724,14 +724,6 @@ public class TestAssignmentManager {
case 2: throw new RegionServerStoppedException("Fake!");
case 3: throw new ServerNotRunningYetException("Fake!");
case 4:
// We will expire the server that we failed to rpc against.
throw new FailedRemoteDispatchException("Fake!");
case 5:
// Mark this regionserver as already expiring so we go different code route; i.e. we
// FAIL to expire the remote server and presume ok to move region to CLOSED. HBASE-20137.
TestAssignmentManager.this.master.getServerManager().expireServer(server);
throw new FailedRemoteDispatchException("Fake!");
case 6:
LOG.info("Return null response from serverName=" + server + "; means STUCK...TODO timeout");
executor.schedule(new Runnable() {
@Override