diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 2971643d8b3..b4548f61239 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -215,7 +215,7 @@ public class HBaseFsck extends Configured implements Closeable { private Table meta; // threads to do ||izable tasks: retrieve data from regionservers, handle overlapping regions protected ExecutorService executor; - private long startMillis = System.currentTimeMillis(); + private long startMillis = EnvironmentEdgeManager.currentTime(); private HFileCorruptionChecker hfcc; private int retcode = 0; private Path HBCK_LOCK_PATH; @@ -297,6 +297,7 @@ public class HBaseFsck extends Configured implements Closeable { private Map<TableName, Set<String>> orphanTableDirs = new HashMap<TableName, Set<String>>(); + private Map<TableName, Set<String>> skippedRegions = new HashMap<TableName, Set<String>>(); /** * List of orphaned table ZNodes @@ -566,6 +567,7 @@ public class HBaseFsck extends Configured implements Closeable { errors.clear(); tablesInfo.clear(); orphanHdfsDirs.clear(); + skippedRegions.clear(); } /** @@ -1717,7 +1719,7 @@ public class HBaseFsck extends Configured implements Closeable { return false; } ServerName sn = metaLocation.getServerName(); - MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis()); + MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, EnvironmentEdgeManager.currentTime()); HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName()); if (hbckInfo == null) { regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m)); @@ -1815,6 +1817,21 @@ public class HBaseFsck extends Configured implements Closeable { } checkRegionConsistencyConcurrently(replicaWorkItems); setCheckHdfs(prevHdfsCheck); + + // If some regions are skipped during the checkRegionConsistencyConcurrently() phase, we might + 
// not get an accurate state of HBase if we continue. The config here allows users to tune + // the tolerance for the number of skipped regions. + // TODO: evaluate the consequence to continue the hbck operation without config. + int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0); + // skippedRegions is keyed by table, so sum the per-table sets to count regions, not tables. + int numOfSkippedRegions = 0; + for (Set<String> skippedRegionNames : skippedRegions.values()) { + numOfSkippedRegions += skippedRegionNames.size(); + } + if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) { + throw new IOException(numOfSkippedRegions + + " region(s) could not be checked or repaired. See logs for detail."); + } } /** @@ -1857,11 +1874,38 @@ public class HBaseFsck extends Configured implements Closeable { @Override public synchronized Void call() throws Exception { - checkRegionConsistency(key, hbi); + try { + checkRegionConsistency(key, hbi); + } catch (Exception e) { + // If the region is non-META region, skip this region and send warning/error message; if + // the region is META region, we should not continue. + LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString() + + "'.", e); + if (hbi.getHdfsHRI().isMetaRegion()) { + throw e; + } + LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'"); + addSkippedRegion(hbi); + } return null; } } - + + /** + * Records a region whose check or repair was skipped so it can be counted and reported later. + * Synchronized on this HBaseFsck because region work items may invoke it from several executor + * threads while skippedRegions is an unsynchronized HashMap. + * @param hbi the region that could not be checked or repaired + */ + private synchronized void addSkippedRegion(final HbckInfo hbi) { + Set<String> skippedRegionNames = skippedRegions.get(hbi.getTableName()); + if (skippedRegionNames == null) { + skippedRegionNames = new HashSet<String>(); + skippedRegions.put(hbi.getTableName(), skippedRegionNames); + } + skippedRegionNames.add(hbi.getRegionNameAsString()); + } + private void preCheckPermission() throws IOException, AccessDeniedException { if (shouldIgnorePreCheckPermission()) { return; @@ -2106,7 +2150,7 @@ public class HBaseFsck extends Configured implements Closeable { (hbi.metaEntry == null)? 
false: hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline(); boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry); boolean recentlyModified = inHdfs && - hbi.getModTime() + timelag > System.currentTimeMillis(); + hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime(); // ========== First the healthy cases ============= if (hbi.containsOnlyHdfsEdits()) { @@ -3113,7 +3157,7 @@ public class HBaseFsck extends Configured implements Closeable { */ HTableDescriptor[] getTables(AtomicInteger numSkipped) { List tableNames = new ArrayList(); - long now = System.currentTimeMillis(); + long now = EnvironmentEdgeManager.currentTime(); for (HbckInfo hbi : regionInfoMap.values()) { MetaEntry info = hbi.metaEntry; @@ -3716,14 +3760,29 @@ public class HBaseFsck extends Configured implements Closeable { */ private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) { StringBuilder sb = new StringBuilder(); + int numOfSkippedRegions; errors.print("Summary:"); for (TableInfo tInfo : tablesInfo.values()) { + numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ? + skippedRegions.get(tInfo.getName()).size() : 0; + if (errors.tableHasErrors(tInfo)) { errors.print("Table " + tInfo.getName() + " is inconsistent."); - } else { - errors.print(" " + tInfo.getName() + " is okay."); + } else if (numOfSkippedRegions > 0) { + errors.print("Table " + tInfo.getName() + " is okay (with " + + numOfSkippedRegions + " skipped regions)."); + } else { + errors.print("Table " + tInfo.getName() + " is okay."); } errors.print(" Number of regions: " + tInfo.getNumRegions()); + if (numOfSkippedRegions > 0) { + Set<String> skippedRegionStrings = skippedRegions.get(tInfo.getName()); + errors.print(" Number of skipped regions: " + numOfSkippedRegions); + errors.print(" List of skipped regions:"); + for (String sr : skippedRegionStrings) { + errors.print(" " + sr); + } + } sb.setLength(0); // clear out existing buffer, if any. 
sb.append(" Deployed on: "); for (ServerName server : tInfo.deployedOn) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java index d21bda1f422..7d8f67fd544 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java @@ -31,7 +31,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MetaTableAccessor; -import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.ZooKeeperConnectionException; @@ -41,7 +40,6 @@ import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.HConnection; -import org.apache.hadoop.hbase.client.HTable; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.master.RegionState; @@ -129,8 +127,8 @@ public class HBaseFsckRepair { public static void waitUntilAssigned(Admin admin, HRegionInfo region) throws IOException, InterruptedException { long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000); - long expiration = timeout + System.currentTimeMillis(); - while (System.currentTimeMillis() < expiration) { + long expiration = timeout + EnvironmentEdgeManager.currentTime(); + while (EnvironmentEdgeManager.currentTime() < expiration) { try { Map rits= admin.getClusterStatus().getRegionsInTransition(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index 
b1f7427fdfe..f8ea4abc866 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -111,6 +111,7 @@ import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl; import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction; +import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE; import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;