diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 46fad964a37..cf4d002455e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -211,7 +211,7 @@ public class HBaseFsck extends Configured implements Closeable { private Table meta; // threads to do ||izable tasks: retrieve data from regionservers, handle overlapping regions protected ExecutorService executor; - private long startMillis = System.currentTimeMillis(); + private long startMillis = EnvironmentEdgeManager.currentTime(); private HFileCorruptionChecker hfcc; private int retcode = 0; private Path HBCK_LOCK_PATH; @@ -294,6 +294,7 @@ public class HBaseFsck extends Configured implements Closeable { new HashMap(); private final RetryCounterFactory lockFileRetryCounterFactory; + private Map> skippedRegions = new HashMap>(); /** * Constructor @@ -556,6 +557,7 @@ public class HBaseFsck extends Configured implements Closeable { errors.clear(); tablesInfo.clear(); orphanHdfsDirs.clear(); + skippedRegions.clear(); } /** @@ -1717,7 +1719,7 @@ public class HBaseFsck extends Configured implements Closeable { return false; } ServerName sn = metaLocation.getServerName(); - MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, System.currentTimeMillis()); + MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, EnvironmentEdgeManager.currentTime()); HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName()); if (hbckInfo == null) { regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m)); @@ -1817,6 +1819,17 @@ public class HBaseFsck extends Configured implements Closeable { checkRegionConsistencyConcurrently(replicaWorkItems); setCheckHdfs(prevHdfsCheck); + // If some regions are skipped during checkRegionConsistencyConcurrently() phase, we 
might + // not get an accurate state of HBase if we continue. The config here allows users to tune + // the tolerated number of skipped regions. + // TODO: evaluate the consequences of continuing the hbck operation without this config. + int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0); + int numOfSkippedRegions = skippedRegions.size(); + if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) { + throw new IOException(numOfSkippedRegions + + " region(s) could not be checked or repaired. See logs for detail."); + } + if (shouldCheckHdfs()) { checkAndFixTableStates(); } @@ -1862,11 +1875,32 @@ public class HBaseFsck extends Configured implements Closeable { @Override public synchronized Void call() throws Exception { - checkRegionConsistency(key, hbi); + try { + checkRegionConsistency(key, hbi); + } catch (Exception e) { + // If the region is a non-META region, skip this region and send a warning/error message; + // if the region is the META region, we should not continue. 
+ LOG.warn("Unable to complete check or repair the region '" + hbi.getRegionNameAsString() + + "'.", e); + if (hbi.getHdfsHRI().isMetaRegion()) { + throw e; + } + LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'"); + addSkippedRegion(hbi); + } return null; } } + private void addSkippedRegion(final HbckInfo hbi) { + Set skippedRegionNames = skippedRegions.get(hbi.getTableName()); + if (skippedRegionNames == null) { + skippedRegionNames = new HashSet(); + } + skippedRegionNames.add(hbi.getRegionNameAsString()); + skippedRegions.put(hbi.getTableName(), skippedRegionNames); + } + /** * Check and fix table states, assumes full info available: * - tableInfos @@ -2156,7 +2190,7 @@ public class HBaseFsck extends Configured implements Closeable { inMeta && hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline(); boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry.getTable()); boolean recentlyModified = inHdfs && - hbi.getModTime() + timelag > System.currentTimeMillis(); + hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime(); // ========== First the healthy cases ============= if (hbi.containsOnlyHdfsEdits()) { @@ -3161,7 +3195,7 @@ public class HBaseFsck extends Configured implements Closeable { */ HTableDescriptor[] getTables(AtomicInteger numSkipped) { List tableNames = new ArrayList(); - long now = System.currentTimeMillis(); + long now = EnvironmentEdgeManager.currentTime(); for (HbckInfo hbi : regionInfoMap.values()) { MetaEntry info = hbi.metaEntry; @@ -3697,14 +3731,30 @@ public class HBaseFsck extends Configured implements Closeable { */ private void printTableSummary(SortedMap tablesInfo) { StringBuilder sb = new StringBuilder(); + int numOfSkippedRegions; errors.print("Summary:"); for (TableInfo tInfo : tablesInfo.values()) { + numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ? 
+ skippedRegions.get(tInfo.getName()).size() : 0; + if (errors.tableHasErrors(tInfo)) { errors.print("Table " + tInfo.getName() + " is inconsistent."); - } else { - errors.print(" " + tInfo.getName() + " is okay."); + } else if (numOfSkippedRegions > 0){ + errors.print("Table " + tInfo.getName() + " is okay (with " + + numOfSkippedRegions + " skipped regions)."); + } + else { + errors.print("Table " + tInfo.getName() + " is okay."); } errors.print(" Number of regions: " + tInfo.getNumRegions()); + if (numOfSkippedRegions > 0) { + Set skippedRegionStrings = skippedRegions.get(tInfo.getName()); + System.out.println(" Number of skipped regions: " + numOfSkippedRegions); + System.out.println(" List of skipped regions:"); + for(String sr : skippedRegionStrings) { + System.out.println(" " + sr); + } + } sb.setLength(0); // clear out existing buffer, if any. sb.append(" Deployed on: "); for (ServerName server : tInfo.deployedOn) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java index 4c742e32454..7de7af8f143 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsckRepair.java @@ -25,7 +25,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MetaTableAccessor; -import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.ZooKeeperConnectionException; @@ -35,7 +34,6 @@ import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.HConnection; -import org.apache.hadoop.hbase.client.HTable; import 
org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.master.RegionState; @@ -123,8 +121,8 @@ public class HBaseFsckRepair { public static void waitUntilAssigned(Admin admin, HRegionInfo region) throws IOException, InterruptedException { long timeout = admin.getConfiguration().getLong("hbase.hbck.assign.timeout", 120000); - long expiration = timeout + System.currentTimeMillis(); - while (System.currentTimeMillis() < expiration) { + long expiration = timeout + EnvironmentEdgeManager.currentTime(); + while (EnvironmentEdgeManager.currentTime() < expiration) { try { Map rits= admin.getClusterStatus().getRegionsInTransition(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index fa160686d37..28b80fff452 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -103,7 +103,6 @@ import org.apache.hadoop.hbase.regionserver.HRegionServer; import org.apache.hadoop.hbase.regionserver.SplitTransactionFactory; import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl; import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction; -import org.apache.hadoop.hbase.security.access.AccessControlClient; import org.apache.hadoop.hbase.testclassification.LargeTests; import org.apache.hadoop.hbase.testclassification.MiscTests; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;