HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java Edit a log. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java Add override of isMatchingRegionLocation. Allow 'null' as a pass in HBCKSCP. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java Add a method for HBCKSCP to override and be less strict filtering assigns. hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp Some doc on what 'Unknown Servers' are.
2020-05-13 22:19:25 -07:00 · 2020-05-13 22:19:25 -07:00 · 2e4be3e77d
parent 941cef71c2
commit 2e4be3e77d
4 changed files with 53 additions and 14 deletions
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java
@ -99,12 +99,11 @@ public class GCMultipleMergedRegionsProcedure extends
        case GC_MERGED_REGIONS_PREPARE:
          // If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan
          // interval, it will end resubmitting GCMultipleMergedRegionsProcedure for the same
-          // region, we can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
+          // region. We can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
          List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(
            env.getMasterServices().getConnection(), mergedChild.getRegionName());
          if (parents == null || parents.isEmpty()) {
-            LOG.info("Region=" + mergedChild.getShortNameToLog()
+            LOG.info("{} mergeXXX qualifiers have ALL been deleted", mergedChild.getShortNameToLog());
                + " info:merge qualifier has been deleted");
            return Flow.NO_MORE_STATE;
          }
          setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE);
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java
@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.Connection;
 import org.apache.hadoop.hbase.client.RegionInfo;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.master.RegionState;
 import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
 import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@ -168,4 +169,16 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure {
      return this.reassigns;
    }
  }
  /**
   * Looser variant of the superclass location test: besides whatever the superclass accepts,
   * a {@link RegionStateNode} that has no region location at all is also treated as a match.
   * <p/>
   * The RegionStateNode will not have a location if a confirm of an OPEN fails; on fail, the
   * RegionStateNode regionLocation is set to null. The HBCKSCP has been scheduled by an operator
   * via hbck2, probably at the behest of a report of an 'Unknown Server' in the 'HBCK Report'.
   * Let the operator's operation succeed even when the region location in the RegionStateNode
   * is null.
   * @param rsn the region state node whose location is being tested
   * @return True if the superclass considers <code>rsn</code> a match, or if <code>rsn</code>
   *   has a null region location.
   */
  @Override
  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
    // Ask the strict superclass test first; only fall back to the null check when it says no.
    if (super.isMatchingRegionLocation(rsn)) {
      return true;
    }
    return rsn.getRegionLocation() == null;
  }
 }
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java
@ -450,6 +450,15 @@ public class ServerCrashProcedure
    return false;
  }
  /**
   * Tests whether a region is still located on the server this procedure is handling.
   * <p/>
   * Kept as its own overridable method so that the HBCK fix-up SCP can substitute a more
   * tolerant definition of what counts as a 'match'.
   * @param rsn the region state node whose location is compared against this procedure's server
   * @return True if the region location in <code>rsn</code> matches that of this crashed server.
   */
  protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
    final boolean locatedOnCrashedServer = serverName.equals(rsn.getRegionLocation());
    return locatedOnCrashedServer;
  }
  /**
   * Assign the regions on the crashed RS to other Rses.
   * <p/>
@ -467,14 +476,17 @@ public class ServerCrashProcedure
      regionNode.lock();
      try {
        // This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure
-        // to us and then try to assign the region to a new RS. And before it has updated the region
+        // and then try to assign the region to a new RS. And before it has updated the region
        // location to the new RS, we may have already called the am.getRegionsOnServer so we will
-        // consider the region is still on us. And then before we arrive here, the TRSP could have
+        // consider the region is still on this crashed server. Then before we arrive here, the
-        // updated the region location, or even finished itself, so the region is no longer on us
+        // TRSP could have updated the region location, or even finished itself, so the region is
-        // any more, we should not try to assign it again. Please see HBASE-23594 for more details.
+        // no longer on this crashed server any more. We should not try to assign it again. Please
-        if (!serverName.equals(regionNode.getRegionLocation())) {
+        // see HBASE-23594 for more details.
-          LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this,
+        // UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get
-            regionNode, serverName);
+        // in the way of our clearing out 'Unknown Servers'.
        if (!isMatchingRegionLocation(regionNode)) {
          LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...",
            this, regionNode, serverName);
          continue;
        }
        if (regionNode.getProcedure() != null) {
--- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
+++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp
@ -112,8 +112,7 @@
        need to check the server still exists. If not, schedule <em>ServerCrashProcedure</em> for it. If exists,
        restart Server2 and Server1):
        3. More than one regionserver reports opened this region (Fix: restart the RegionServers).
-        Notice: the reported online regionservers may be not right when there are regions in transition.
+        Note: the reported online regionservers may not be up-to-date when there are regions in transition.
        Please check them in regionserver's web UI.
        </span>
      </p>
@ -165,8 +164,9 @@
  </div>
      <p>
        <span>
-          The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
+          The below are Regions we've lost account of. To be safe, run bulk load of any data found under these Region orphan directories to have the
-          First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may complain);
+          cluster re-adopt data.
          First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may fail);
          run <em>hbck2 fixMeta</em>. Once this is done, per Region below, run a bulk
          load -- <em>$ hbase completebulkload REGION_DIR_PATH TABLE_NAME</em> -- and then delete the desiccated directory content (HFiles are removed upon
          successful load; all that is left are empty directories and occasionally a seqid marking file).
@ -259,6 +259,21 @@
                <h2>Unknown Servers</h2>
              </div>
            </div>
            <p>
              <span>The below are servers mentioned in the hbase:meta table that are no longer 'live' or known 'dead'.
                The server likely belongs to an older cluster epoch since replaced by a new instance because of a restart/crash.
                To clear 'Unknown Servers', run 'hbck2 scheduleRecoveries UNKNOWN_SERVERNAME'. This will schedule a ServerCrashProcedure.
                It will clear out 'Unknown Server' references and schedule reassigns of any Regions that were associated with this host.
                But first, be sure the referenced Region is not currently stuck looping trying to OPEN. Does it show as a Region-In-Transition on the
                Master home page? Is it mentioned in the 'Procedures and Locks' Procedures list? If so, perhaps it is stuck in a loop
                trying to OPEN but unable to because of a missing reference or file.
                Read the Master log looking for the most recent
                mentions of the associated Region name. Try and address any such complaint first. If successful, a side-effect
                should be the clean up of the 'Unknown Servers' list. It may take a while. OPENs are retried forever but the interval
                between retries grows. The 'Unknown Server' may be cleared because it is just the last RegionServer the Region was
                successfully opened on; on the next open, the 'Unknown Server' will be purged.
              </span>
            </p>
            <table class="table table-striped">
              <tr>
                <th>RegionInfo</th>