HBASE-13576 HBCK enhancement: Failure in checking one region should not fail the entire HBCK operation. (Stephen Yuan Jiang)

This commit is contained in:
Enis Soztutar 2015-05-06 12:08:36 -07:00
parent 67c6352e8a
commit 11b76732c0
3 changed files with 59 additions and 12 deletions

View File

@ -211,7 +211,7 @@ public class HBaseFsck extends Configured implements Closeable {
private Table meta;
// threads to do ||izable tasks: retrieve data from regionservers, handle overlapping regions
protected ExecutorService executor;
private long startMillis = System.currentTimeMillis();
private long startMillis = EnvironmentEdgeManager.currentTime();
private HFileCorruptionChecker hfcc;
private int retcode = 0;
private Path HBCK_LOCK_PATH;
@ -294,6 +294,7 @@ public class HBaseFsck extends Configured implements Closeable {
new HashMap<TableName, TableState>();
private final RetryCounterFactory lockFileRetryCounterFactory;
private Map<TableName, Set<String>> skippedRegions = new HashMap<TableName, Set<String>>();
/**
* Constructor
@ -556,6 +557,7 @@ public class HBaseFsck extends Configured implements Closeable {
errors.clear();
tablesInfo.clear();
orphanHdfsDirs.clear();
skippedRegions.clear();
}
/**
@ -1717,7 +1719,7 @@ public class HBaseFsck extends Configured implements Closeable {
return false;
}
ServerName sn = metaLocation.getServerName();
MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis());
MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, EnvironmentEdgeManager.currentTime());
HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
if (hbckInfo == null) {
regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
@ -1817,6 +1819,17 @@ public class HBaseFsck extends Configured implements Closeable {
checkRegionConsistencyConcurrently(replicaWorkItems);
setCheckHdfs(prevHdfsCheck);
// If some regions is skipped during checkRegionConsistencyConcurrently() phase, we might
// not get accurate state of the hbase if continuing. The config here allows users to tune
// the tolerance of number of skipped region.
// TODO: evaluate the consequence to continue the hbck operation without config.
int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0);
int numOfSkippedRegions = skippedRegions.size();
if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) {
throw new IOException(numOfSkippedRegions
+ " region(s) could not be checked or repaired. See logs for detail.");
}
if (shouldCheckHdfs()) {
checkAndFixTableStates();
}
@ -1862,11 +1875,32 @@ public class HBaseFsck extends Configured implements Closeable {
@Override
public synchronized Void call() throws Exception {
  // Check (and possibly repair) a single region. A failure on an ordinary
  // region is downgraded to a warning and recorded, so one bad region does
  // not abort the whole hbck run; a failure on the META region is fatal
  // because later phases cannot produce a trustworthy result without it.
  try {
    checkRegionConsistency(key, hbi);
  } catch (Exception e) {
    // Preserve the original exception in the log so the root cause is visible.
    LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString()
      + "'.", e);
    // NOTE(review): getHdfsHRI() may be null for regions with no HDFS entry —
    // confirm; a null here would turn the skip path into an NPE.
    if (hbi.getHdfsHRI().isMetaRegion()) {
      throw e;
    }
    LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'");
    addSkippedRegion(hbi);
  }
  return null;
}
}
/**
 * Records a region that could not be checked or repaired, grouped by table,
 * for later reporting in the summary and the skipped-region threshold check.
 * <p>
 * Synchronized because it is invoked from concurrently executing work items:
 * each work item's {@code call()} is synchronized only on its own instance,
 * so without a shared monitor two threads could race on the backing HashMap.
 *
 * @param hbi region metadata for the region being skipped
 */
private synchronized void addSkippedRegion(final HbckInfo hbi) {
  Set<String> skippedRegionNames = skippedRegions.get(hbi.getTableName());
  if (skippedRegionNames == null) {
    skippedRegionNames = new HashSet<String>();
    skippedRegions.put(hbi.getTableName(), skippedRegionNames);
  }
  skippedRegionNames.add(hbi.getRegionNameAsString());
}
/**
* Check and fix table states, assumes full info available:
* - tableInfos
@ -2156,7 +2190,7 @@ public class HBaseFsck extends Configured implements Closeable {
inMeta && hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry.getTable());
boolean recentlyModified = inHdfs &&
hbi.getModTime() + timelag > System.currentTimeMillis();
hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime();
// ========== First the healthy cases =============
if (hbi.containsOnlyHdfsEdits()) {
@ -3161,7 +3195,7 @@ public class HBaseFsck extends Configured implements Closeable {
*/
HTableDescriptor[] getTables(AtomicInteger numSkipped) {
List<TableName> tableNames = new ArrayList<TableName>();
long now = System.currentTimeMillis();
long now = EnvironmentEdgeManager.currentTime();
for (HbckInfo hbi : regionInfoMap.values()) {
MetaEntry info = hbi.metaEntry;
@ -3697,14 +3731,30 @@ public class HBaseFsck extends Configured implements Closeable {
*/
private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
StringBuilder sb = new StringBuilder();
int numOfSkippedRegions;
errors.print("Summary:");
for (TableInfo tInfo : tablesInfo.values()) {
numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ?
skippedRegions.get(tInfo.getName()).size() : 0;
if (errors.tableHasErrors(tInfo)) {
errors.print("Table " + tInfo.getName() + " is inconsistent.");
} else {
errors.print(" " + tInfo.getName() + " is okay.");
} else if (numOfSkippedRegions > 0){
errors.print("Table " + tInfo.getName() + " is okay (with "
+ numOfSkippedRegions + " skipped regions).");
}
else {
errors.print("Table " + tInfo.getName() + " is okay.");
}
errors.print(" Number of regions: " + tInfo.getNumRegions());
if (numOfSkippedRegions > 0) {
Set<String> skippedRegionStrings = skippedRegions.get(tInfo.getName());
System.out.println(" Number of skipped regions: " + numOfSkippedRegions);
System.out.println(" List of skipped regions:");
for(String sr : skippedRegionStrings) {
System.out.println(" " + sr);
}
}
sb.setLength(0); // clear out existing buffer, if any.
sb.append(" Deployed on: ");
for (ServerName server : tInfo.deployedOn) {

View File

@ -25,7 +25,6 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
@ -35,7 +34,6 @@ import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RegionState;
@ -123,8 +121,8 @@ public class HBaseFsckRepair {
public static void waitUntilAssigned(Admin admin,
HRegionInfo region) throws IOException, InterruptedException {
long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000);
long expiration = timeout + System.currentTimeMillis();
while (System.currentTimeMillis() < expiration) {
long expiration = timeout + EnvironmentEdgeManager.currentTime();
while (EnvironmentEdgeManager.currentTime() < expiration) {
try {
Map<String, RegionState> rits=
admin.getClusterStatus().getRegionsInTransition();

View File

@ -103,7 +103,6 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.SplitTransactionFactory;
import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
import org.apache.hadoop.hbase.security.access.AccessControlClient;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MiscTests;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;