From 2e4be3e77db6dc53626b3d99c26699273b91399d Mon Sep 17 00:00:00 2001 From: stack Date: Wed, 13 May 2020 22:19:25 -0700 Subject: [PATCH] HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java Edit a log. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java Add override of isMatchingRegionLocation. Allow 'null' as a pass in HBCKSCP. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java Add a method for HBCKSCP to override and be less strict filtering assigns. hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp Some doc on what 'Unknown Servers' are. --- .../GCMultipleMergedRegionsProcedure.java | 5 ++-- .../procedure/HBCKServerCrashProcedure.java | 13 ++++++++++ .../procedure/ServerCrashProcedure.java | 26 ++++++++++++++----- .../resources/hbase-webapps/master/hbck.jsp | 23 +++++++++++++--- 4 files changed, 53 insertions(+), 14 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java index 4fc5484aca8..71fcd3544c3 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java @@ -99,12 +99,11 @@ public class GCMultipleMergedRegionsProcedure extends case GC_MERGED_REGIONS_PREPARE: // If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan // interval, it will end resubmitting GCMultipleMergedRegionsProcedure for the same - // region, we can skip duplicate GCMultipleMergedRegionsProcedure while previous finished + // region. 
We can skip duplicate GCMultipleMergedRegionsProcedure while previous finished List parents = MetaTableAccessor.getMergeRegions( env.getMasterServices().getConnection(), mergedChild.getRegionName()); if (parents == null || parents.isEmpty()) { - LOG.info("Region=" + mergedChild.getShortNameToLog() - + " info:merge qualifier has been deleted"); + LOG.info("{} mergeXXX qualifiers have ALL been deleted", mergedChild.getShortNameToLog()); return Flow.NO_MORE_STATE; } setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java index eec820cc150..a12b853e919 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.master.RegionState; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.RegionStateStore; import org.apache.yetus.audience.InterfaceAudience; import org.slf4j.Logger; @@ -168,4 +169,16 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure { return this.reassigns; } } + + /** + * The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail, + * the RegionStateNode regionLocation is set to null. This is 'looser' than the test done + * in the superclass. The HBCKSCP has been scheduled by an operator via hbck2 probably at the + * behest of a report of an 'Unknown Server' in the 'HBCK Report'. Let the operator's operation + * succeed even in the case where the region location in the RegionStateNode is null. 
+ */ + @Override + protected boolean isMatchingRegionLocation(RegionStateNode rsn) { + return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 178343feecd..076c2668d2e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -450,6 +450,15 @@ public class ServerCrashProcedure return false; } + /** + * Moved out here so can be overridden by the HBCK fix-up SCP to be less strict about what + * it will tolerate as a 'match'. + * @return True if the region location in rsn matches that of this crashed server. + */ + protected boolean isMatchingRegionLocation(RegionStateNode rsn) { + return this.serverName.equals(rsn.getRegionLocation()); + } + /** * Assign the regions on the crashed RS to other Rses. *

@@ -467,14 +476,17 @@ public class ServerCrashProcedure regionNode.lock(); try { // This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure - // to us and then try to assign the region to a new RS. And before it has updated the region + // and then try to assign the region to a new RS. And before it has updated the region // location to the new RS, we may have already called the am.getRegionsOnServer so we will - // consider the region is still on us. And then before we arrive here, the TRSP could have - // updated the region location, or even finished itself, so the region is no longer on us - // any more, we should not try to assign it again. Please see HBASE-23594 for more details. - if (!serverName.equals(regionNode.getRegionLocation())) { - LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this, - regionNode, serverName); + // consider the region is still on this crashed server. Then before we arrive here, the + // TRSP could have updated the region location, or even finished itself, so the region is + // no longer on this crashed server any more. We should not try to assign it again. Please + // see HBASE-23594 for more details. + // UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get + // in the way of our clearing out 'Unknown Servers'. + if (!isMatchingRegionLocation(regionNode)) { + LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...", + this, regionNode, serverName); continue; } if (regionNode.getProcedure() != null) { diff --git a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp index d90827c4018..f0a2ce17704 100644 --- a/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp +++ b/hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp @@ -112,8 +112,7 @@ need to check the server still exists. 
If not, schedule ServerCrashProcedure for it. If exists, restart Server2 and Server1): 3. More than one regionserver reports opened this region (Fix: restart the RegionServers). - Notice: the reported online regionservers may be not right when there are regions in transition. - Please check them in regionserver's web UI. + Note: the reported online regionservers may not be up-to-date when there are regions in transition.

@@ -165,8 +164,9 @@

- The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster. - First make sure hbase:meta is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may complain); + The below are Regions we've lost account of. To be safe, run bulk load of any data found under these Region orphan directories to have the + cluster re-adopt data. + First make sure hbase:meta is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may fail); run hbck2 fixMeta. Once this is done, per Region below, run a bulk load -- $ hbase completebulkload REGION_DIR_PATH TABLE_NAME -- and then delete the desiccated directory content (HFiles are removed upon successful load; all that is left are empty directories and occasionally a seqid marking file). @@ -259,6 +259,21 @@

Unknown Servers

+

+ The below are servers mentioned in the hbase:meta table that are neither 'live' nor known 'dead'. + The server likely belongs to an older cluster epoch since replaced by a new instance because of a restart/crash. + To clear 'Unknown Servers', run 'hbck2 scheduleRecoveries UNKNOWN_SERVERNAME'. This will schedule a ServerCrashProcedure. + It will clear out 'Unknown Server' references and schedule reassigns of any Regions that were associated with this host. + But first, be sure the referenced Region is not currently stuck looping trying to OPEN. Does it show as a Region-In-Transition on the + Master home page? Is it mentioned in the 'Procedures and Locks' Procedures list? If so, perhaps it is stuck in a loop + trying to OPEN but unable to because of a missing reference or file. + Read the Master log looking for the most recent + mentions of the associated Region name. Try and address any such complaint first. If successful, a side-effect + should be the clean up of the 'Unknown Servers' list. It may take a while. OPENs are retried forever but the interval + between retries grows. The 'Unknown Server' may be cleared because it is just the last RegionServer the Region was + successfully opened on; on the next open, the 'Unknown Server' will be purged. +

RegionInfo