HBASE-13576 HBCK enhancement: Failure in checking one region should not fail the entire HBCK operation. (Stephen Yuan Jiang)
This commit is contained in:
parent
67c6352e8a
commit
11b76732c0
|
@ -211,7 +211,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
private Table meta;
|
||||
// threads to do ||izable tasks: retrieve data from regionservers, handle overlapping regions
|
||||
protected ExecutorService executor;
|
||||
private long startMillis = System.currentTimeMillis();
|
||||
private long startMillis = EnvironmentEdgeManager.currentTime();
|
||||
private HFileCorruptionChecker hfcc;
|
||||
private int retcode = 0;
|
||||
private Path HBCK_LOCK_PATH;
|
||||
|
@ -294,6 +294,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
new HashMap<TableName, TableState>();
|
||||
private final RetryCounterFactory lockFileRetryCounterFactory;
|
||||
|
||||
private Map<TableName, Set<String>> skippedRegions = new HashMap<TableName, Set<String>>();
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
|
@ -556,6 +557,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
errors.clear();
|
||||
tablesInfo.clear();
|
||||
orphanHdfsDirs.clear();
|
||||
skippedRegions.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1717,7 +1719,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
return false;
|
||||
}
|
||||
ServerName sn = metaLocation.getServerName();
|
||||
MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis());
|
||||
MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, EnvironmentEdgeManager.currentTime());
|
||||
HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
|
||||
if (hbckInfo == null) {
|
||||
regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
|
||||
|
@ -1817,6 +1819,17 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
checkRegionConsistencyConcurrently(replicaWorkItems);
|
||||
setCheckHdfs(prevHdfsCheck);
|
||||
|
||||
// If some regions is skipped during checkRegionConsistencyConcurrently() phase, we might
|
||||
// not get accurate state of the hbase if continuing. The config here allows users to tune
|
||||
// the tolerance of number of skipped region.
|
||||
// TODO: evaluate the consequence to continue the hbck operation without config.
|
||||
int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0);
|
||||
int numOfSkippedRegions = skippedRegions.size();
|
||||
if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) {
|
||||
throw new IOException(numOfSkippedRegions
|
||||
+ " region(s) could not be checked or repaired. See logs for detail.");
|
||||
}
|
||||
|
||||
if (shouldCheckHdfs()) {
|
||||
checkAndFixTableStates();
|
||||
}
|
||||
|
@ -1862,11 +1875,32 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
|
||||
@Override
|
||||
public synchronized Void call() throws Exception {
|
||||
checkRegionConsistency(key, hbi);
|
||||
try {
|
||||
checkRegionConsistency(key, hbi);
|
||||
} catch (Exception e) {
|
||||
// If the region is non-META region, skip this region and send warning/error message; if
|
||||
// the region is META region, we should not continue.
|
||||
LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString()
|
||||
+ "'.", e);
|
||||
if (hbi.getHdfsHRI().isMetaRegion()) {
|
||||
throw e;
|
||||
}
|
||||
LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'");
|
||||
addSkippedRegion(hbi);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private void addSkippedRegion(final HbckInfo hbi) {
|
||||
Set<String> skippedRegionNames = skippedRegions.get(hbi.getTableName());
|
||||
if (skippedRegionNames == null) {
|
||||
skippedRegionNames = new HashSet<String>();
|
||||
}
|
||||
skippedRegionNames.add(hbi.getRegionNameAsString());
|
||||
skippedRegions.put(hbi.getTableName(), skippedRegionNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check and fix table states, assumes full info available:
|
||||
* - tableInfos
|
||||
|
@ -2156,7 +2190,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
inMeta && hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
|
||||
boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry.getTable());
|
||||
boolean recentlyModified = inHdfs &&
|
||||
hbi.getModTime() + timelag > System.currentTimeMillis();
|
||||
hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime();
|
||||
|
||||
// ========== First the healthy cases =============
|
||||
if (hbi.containsOnlyHdfsEdits()) {
|
||||
|
@ -3161,7 +3195,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
*/
|
||||
HTableDescriptor[] getTables(AtomicInteger numSkipped) {
|
||||
List<TableName> tableNames = new ArrayList<TableName>();
|
||||
long now = System.currentTimeMillis();
|
||||
long now = EnvironmentEdgeManager.currentTime();
|
||||
|
||||
for (HbckInfo hbi : regionInfoMap.values()) {
|
||||
MetaEntry info = hbi.metaEntry;
|
||||
|
@ -3697,14 +3731,30 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
*/
|
||||
private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int numOfSkippedRegions;
|
||||
errors.print("Summary:");
|
||||
for (TableInfo tInfo : tablesInfo.values()) {
|
||||
numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ?
|
||||
skippedRegions.get(tInfo.getName()).size() : 0;
|
||||
|
||||
if (errors.tableHasErrors(tInfo)) {
|
||||
errors.print("Table " + tInfo.getName() + " is inconsistent.");
|
||||
} else {
|
||||
errors.print(" " + tInfo.getName() + " is okay.");
|
||||
} else if (numOfSkippedRegions > 0){
|
||||
errors.print("Table " + tInfo.getName() + " is okay (with "
|
||||
+ numOfSkippedRegions + " skipped regions).");
|
||||
}
|
||||
else {
|
||||
errors.print("Table " + tInfo.getName() + " is okay.");
|
||||
}
|
||||
errors.print(" Number of regions: " + tInfo.getNumRegions());
|
||||
if (numOfSkippedRegions > 0) {
|
||||
Set<String> skippedRegionStrings = skippedRegions.get(tInfo.getName());
|
||||
System.out.println(" Number of skipped regions: " + numOfSkippedRegions);
|
||||
System.out.println(" List of skipped regions:");
|
||||
for(String sr : skippedRegionStrings) {
|
||||
System.out.println(" " + sr);
|
||||
}
|
||||
}
|
||||
sb.setLength(0); // clear out existing buffer, if any.
|
||||
sb.append(" Deployed on: ");
|
||||
for (ServerName server : tInfo.deployedOn) {
|
||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.hadoop.fs.Path;
|
|||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.MetaTableAccessor;
|
||||
import org.apache.hadoop.hbase.NotServingRegionException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
|
||||
|
@ -35,7 +34,6 @@ import org.apache.hadoop.hbase.client.ClusterConnection;
|
|||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.client.HConnection;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.Table;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
|
@ -123,8 +121,8 @@ public class HBaseFsckRepair {
|
|||
public static void waitUntilAssigned(Admin admin,
|
||||
HRegionInfo region) throws IOException, InterruptedException {
|
||||
long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000);
|
||||
long expiration = timeout + System.currentTimeMillis();
|
||||
while (System.currentTimeMillis() < expiration) {
|
||||
long expiration = timeout + EnvironmentEdgeManager.currentTime();
|
||||
while (EnvironmentEdgeManager.currentTime() < expiration) {
|
||||
try {
|
||||
Map<String, RegionState> rits=
|
||||
admin.getClusterStatus().getRegionsInTransition();
|
||||
|
|
|
@ -103,7 +103,6 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
|||
import org.apache.hadoop.hbase.regionserver.SplitTransactionFactory;
|
||||
import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
|
||||
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
|
||||
import org.apache.hadoop.hbase.security.access.AccessControlClient;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MiscTests;
|
||||
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
|
||||
|
|
Loading…
Reference in New Issue