HBASE-24368 Let HBCKSCP clear 'Unknown Servers', even if RegionStateNode has RegionLocation == null
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/GCMultipleMergedRegionsProcedure.java Edit a log. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java Add override of isMatchingRegionLocation. Allow 'null' as a pass in HBCKSCP. hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java Add a method for HBCKSCP to override and be less strict filtering assigns. hbase-server/src/main/resources/hbase-webapps/master/hbck.jsp Some doc on what 'Unknown Servers' are.
This commit is contained in:
parent
941cef71c2
commit
2e4be3e77d
|
@ -99,12 +99,11 @@ public class GCMultipleMergedRegionsProcedure extends
|
||||||
case GC_MERGED_REGIONS_PREPARE:
|
case GC_MERGED_REGIONS_PREPARE:
|
||||||
// If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan
|
// If GCMultipleMergedRegionsProcedure processing is slower than the CatalogJanitor's scan
|
||||||
// interval, it will end resubmitting GCMultipleMergedRegionsProcedure for the same
|
// interval, it will end up resubmitting GCMultipleMergedRegionsProcedure for the same
|
||||||
// region, we can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
|
// region. We can skip duplicate GCMultipleMergedRegionsProcedure while previous finished
|
||||||
List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(
|
List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(
|
||||||
env.getMasterServices().getConnection(), mergedChild.getRegionName());
|
env.getMasterServices().getConnection(), mergedChild.getRegionName());
|
||||||
if (parents == null || parents.isEmpty()) {
|
if (parents == null || parents.isEmpty()) {
|
||||||
LOG.info("Region=" + mergedChild.getShortNameToLog()
|
LOG.info("{} mergeXXX qualifiers have ALL been deleted", mergedChild.getShortNameToLog());
|
||||||
+ " info:merge qualifier has been deleted");
|
|
||||||
return Flow.NO_MORE_STATE;
|
return Flow.NO_MORE_STATE;
|
||||||
}
|
}
|
||||||
setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE);
|
setNextState(GCMergedRegionsState.GC_MERGED_REGIONS_PURGE);
|
||||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.client.Connection;
|
||||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.client.Result;
|
import org.apache.hadoop.hbase.client.Result;
|
||||||
import org.apache.hadoop.hbase.master.RegionState;
|
import org.apache.hadoop.hbase.master.RegionState;
|
||||||
|
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||||
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
|
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
|
||||||
import org.apache.yetus.audience.InterfaceAudience;
|
import org.apache.yetus.audience.InterfaceAudience;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -168,4 +169,16 @@ public class HBCKServerCrashProcedure extends ServerCrashProcedure {
|
||||||
return this.reassigns;
|
return this.reassigns;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The RegionStateNode will not have a location if a confirm of an OPEN fails. On fail,
|
||||||
|
* the RegionStateNode regionLocation is set to null. This is 'looser' than the test done
|
||||||
|
* in the superclass. The HBCKSCP has been scheduled by an operator via hbck2 probably at the
|
||||||
|
* behest of a report of an 'Unknown Server' in the 'HBCK Report'. Let the operator's operation
|
||||||
|
* succeed even in case where the region location in the RegionStateNode is null.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
|
||||||
|
return super.isMatchingRegionLocation(rsn) || rsn.getRegionLocation() == null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -450,6 +450,15 @@ public class ServerCrashProcedure
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Moved out here so can be overridden by the HBCK fix-up SCP to be less strict about what
|
||||||
|
* it will tolerate as a 'match'.
|
||||||
|
* @return True if the region location in <code>rsn</code> matches that of this crashed server.
|
||||||
|
*/
|
||||||
|
protected boolean isMatchingRegionLocation(RegionStateNode rsn) {
|
||||||
|
return this.serverName.equals(rsn.getRegionLocation());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Assign the regions on the crashed RS to other Rses.
|
* Assign the regions on the crashed RS to other Rses.
|
||||||
* <p/>
|
* <p/>
|
||||||
|
@ -467,14 +476,17 @@ public class ServerCrashProcedure
|
||||||
regionNode.lock();
|
regionNode.lock();
|
||||||
try {
|
try {
|
||||||
// This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure
|
// This is possible, as when a server is dead, TRSP will fail to schedule a RemoteProcedure
|
||||||
// to us and then try to assign the region to a new RS. And before it has updated the region
|
// and then try to assign the region to a new RS. And before it has updated the region
|
||||||
// location to the new RS, we may have already called the am.getRegionsOnServer so we will
|
// location to the new RS, we may have already called the am.getRegionsOnServer so we will
|
||||||
// consider the region is still on us. And then before we arrive here, the TRSP could have
|
// consider the region is still on this crashed server. Then before we arrive here, the
|
||||||
// updated the region location, or even finished itself, so the region is no longer on us
|
// TRSP could have updated the region location, or even finished itself, so the region is
|
||||||
// any more, we should not try to assign it again. Please see HBASE-23594 for more details.
|
// no longer on this crashed server. We should not try to assign it again. Please
|
||||||
if (!serverName.equals(regionNode.getRegionLocation())) {
|
// see HBASE-23594 for more details.
|
||||||
LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this,
|
// UPDATE: HBCKServerCrashProcedure overrides isMatchingRegionLocation; this check can get
|
||||||
regionNode, serverName);
|
// in the way of our clearing out 'Unknown Servers'.
|
||||||
|
if (!isMatchingRegionLocation(regionNode)) {
|
||||||
|
LOG.info("{} found {} whose regionLocation no longer matches {}, skipping assign...",
|
||||||
|
this, regionNode, serverName);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (regionNode.getProcedure() != null) {
|
if (regionNode.getProcedure() != null) {
|
||||||
|
|
|
@ -112,8 +112,7 @@
|
||||||
need to check the server still exists. If not, schedule <em>ServerCrashProcedure</em> for it. If exists,
|
need to check the server still exists. If not, schedule <em>ServerCrashProcedure</em> for it. If exists,
|
||||||
restart Server2 and Server1):
|
restart Server2 and Server1):
|
||||||
3. More than one regionserver reports opened this region (Fix: restart the RegionServers).
|
3. More than one regionserver reports opened this region (Fix: restart the RegionServers).
|
||||||
Notice: the reported online regionservers may be not right when there are regions in transition.
|
Note: the reported online regionservers may not be up-to-date when there are regions in transition.
|
||||||
Please check them in regionserver's web UI.
|
|
||||||
</span>
|
</span>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
@ -165,8 +164,9 @@
|
||||||
</div>
|
</div>
|
||||||
<p>
|
<p>
|
||||||
<span>
|
<span>
|
||||||
The below are Regions we've lost account of. To be safe, run bulk load of any data found in these Region orphan directories back into the HBase cluster.
|
The below are Regions we've lost account of. To be safe, run bulk load of any data found under these Region orphan directories to have the
|
||||||
First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may complain);
|
cluster re-adopt data.
|
||||||
|
First make sure <em>hbase:meta</em> is in a healthy state, that there are no holes, overlaps or inconsistencies (else bulk load may fail);
|
||||||
run <em>hbck2 fixMeta</em>. Once this is done, per Region below, run a bulk
|
run <em>hbck2 fixMeta</em>. Once this is done, per Region below, run a bulk
|
||||||
load -- <em>$ hbase completebulkload REGION_DIR_PATH TABLE_NAME</em> -- and then delete the desiccated directory content (HFiles are removed upon
|
load -- <em>$ hbase completebulkload REGION_DIR_PATH TABLE_NAME</em> -- and then delete the desiccated directory content (HFiles are removed upon
|
||||||
successful load; all that is left are empty directories and occasionally a seqid marking file).
|
successful load; all that is left are empty directories and occasionally a seqid marking file).
|
||||||
|
@ -259,6 +259,21 @@
|
||||||
<h2>Unknown Servers</h2>
|
<h2>Unknown Servers</h2>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<p>
|
||||||
|
<span>The below are servers mentioned in the hbase:meta table that are no longer 'live' or known 'dead'.
|
||||||
|
The server likely belongs to an older cluster epoch since replaced by a new instance because of a restart/crash.
|
||||||
|
To clear 'Unknown Servers', run 'hbck2 scheduleRecoveries UNKNOWN_SERVERNAME'. This will schedule a ServerCrashProcedure.
|
||||||
|
It will clear out 'Unknown Server' references and schedule reassigns of any Regions that were associated with this host.
|
||||||
|
But first, be sure the referenced Region is not currently stuck looping trying to OPEN. Does it show as a Region-In-Transition on the
|
||||||
|
Master home page? Is it mentioned in the 'Procedures and Locks' Procedures list? If so, perhaps it is stuck in a loop
|
||||||
|
trying to OPEN but unable to because of a missing reference or file.
|
||||||
|
Read the Master log looking for the most recent
|
||||||
|
mentions of the associated Region name. Try and address any such complaint first. If successful, a side-effect
|
||||||
|
should be the clean up of the 'Unknown Servers' list. It may take a while. OPENs are retried forever but the interval
|
||||||
|
between retries grows. The 'Unknown Server' may be cleared because it is just the last RegionServer the Region was
|
||||||
|
successfully opened on; on the next open, the 'Unknown Server' will be purged.
|
||||||
|
</span>
|
||||||
|
</p>
|
||||||
<table class="table table-striped">
|
<table class="table table-striped">
|
||||||
<tr>
|
<tr>
|
||||||
<th>RegionInfo</th>
|
<th>RegionInfo</th>
|
||||||
|
|
Loading…
Reference in New Issue