From 04b3f13b38a9fb9a40baa0e7f7647fd3568500d2 Mon Sep 17 00:00:00 2001 From: jxiang Date: Wed, 19 Dec 2012 17:29:39 +0000 Subject: [PATCH] HBASE-7199 hbck should check lingering reference hfile and have option to sideline them automatically git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1423970 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop/hbase/regionserver/StoreFile.java | 3 +- .../apache/hadoop/hbase/util/HBaseFsck.java | 81 ++++++++++++++++++- .../hadoop/hbase/util/TestHBaseFsck.java | 29 +++++++ .../hbase/util/hbck/HbckTestingUtil.java | 5 +- 4 files changed, 112 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java index e82a1015b0f..8beb8cd46cc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java @@ -65,7 +65,6 @@ import org.apache.hadoop.hbase.util.BloomFilter; import org.apache.hadoop.hbase.util.BloomFilterFactory; import org.apache.hadoop.hbase.util.BloomFilterWriter; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.Writables; import org.apache.hadoop.io.RawComparator; @@ -336,7 +335,7 @@ public class StoreFile { * @return Calculated path to parent region file. 
* @throws IOException */ - static Path getReferredToFile(final Path p) { + public static Path getReferredToFile(final Path p) { Matcher m = REF_NAME_PARSER.matcher(p.getName()); if (m == null || !m.matches()) { LOG.warn("Failed match of store file name " + p.toString()); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java index 2934bfd4305..0a4d36c8d90 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/HBaseFsck.java @@ -85,6 +85,7 @@ import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.StoreFile; import org.apache.hadoop.hbase.regionserver.wal.HLogUtil; import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE; @@ -191,6 +192,7 @@ public class HBaseFsck extends Configured implements Tool { private boolean fixTableOrphans = false; // fix fs holes (missing .tableinfo) private boolean fixVersionFile = false; // fix missing hbase.version file in hdfs private boolean fixSplitParents = false; // fix lingering split parents + private boolean fixReferenceFiles = false; // fix lingering reference store file // limit checking/fixes to listed tables, if empty attempt to check/fix all // -ROOT- and .META. 
are always checked @@ -442,6 +444,8 @@ public class HBaseFsck extends Configured implements Tool { admin.setBalancerRunning(oldBalancer, false); } + offlineReferenceFileRepair(); + // Print table summary printTableSummary(tablesInfo); return errors.summarize(); @@ -597,6 +601,67 @@ public class HBaseFsck extends Configured implements Tool { return errors.getErrorList().size(); } + /** + * Scan all the store file names to find any lingering reference files, + * which refer to some non-existent files. If "fix" option is enabled, + * any lingering reference file will be sidelined if found. + * <p>

+ * Lingering reference file prevents a region from opening. It has to + * be fixed before a cluster can start properly. + */ + private void offlineReferenceFileRepair() throws IOException { + Configuration conf = getConf(); + Path hbaseRoot = FSUtils.getRootDir(conf); + FileSystem fs = hbaseRoot.getFileSystem(conf); + Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot); + for (Path path: allFiles.values()) { + boolean isReference = false; + try { + isReference = StoreFile.isReference(path); + } catch (Throwable t) { + // Ignore. Some files may not be store files at all. + // For example, files under .oldlogs folder in .META. + // Warning message is already logged by + // StoreFile#isReference. + } + if (!isReference) continue; + + Path referredToFile = StoreFile.getReferredToFile(path); + if (fs.exists(referredToFile)) continue; // good, expected + + // Found a lingering reference file + errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE, + "Found lingering reference file " + path); + if (!shouldFixReferenceFiles()) continue; + + // Now, trying to fix it since requested + boolean success = false; + String pathStr = path.toString(); + + // A reference file path should be like + // ${hbase.rootdir}/table_name/region_id/family_name/referred_file.region_name + // Up 3 directories to get the table folder; search from index - 1 since lastIndexOf includes fromIndex. + // So the file will be sidelined to a similar folder structure (leading separator dropped to keep dst under the sideline dir). + int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR); + for (int i = 0; index > 0 && i < 3; i++) { + index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1); + } + if (index > 0) { + Path rootDir = getSidelineDir(); + Path dst = new Path(rootDir, pathStr.substring(index + 1)); + fs.mkdirs(dst.getParent()); + LOG.info("Trying to sideline reference file " + + path + " to " + dst); + setShouldRerun(); + + success = fs.rename(path, dst); + } + if (!success) { + LOG.error("Failed to sideline reference file " + path); + } + } + } + /** * TODO -- need to add tests for this.
*/ @@ -2771,7 +2836,7 @@ public class HBaseFsck extends Configured implements Tool { MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE, FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS, HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION, - ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE + ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE } public void clear(); public void report(String message); @@ -3204,6 +3269,14 @@ public class HBaseFsck extends Configured implements Tool { return fixSplitParents; } + public void setFixReferenceFiles(boolean shouldFix) { + fixReferenceFiles = shouldFix; + } + + boolean shouldFixReferenceFiles() { + return fixReferenceFiles; + } + public boolean shouldIgnorePreCheckPermission() { return ignorePreCheckPermission; } @@ -3315,6 +3388,7 @@ public class HBaseFsck extends Configured implements Tool { System.err.println(" -maxOverlapsToSideline When fixing region overlaps, allow at most regions to sideline per group. 
(n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)"); System.err.println(" -fixSplitParents Try to force offline split parents to be online."); System.err.println(" -ignorePreCheckPermission ignore filesystem permission pre-check"); + System.err.println(" -fixReferenceFiles Try to offline lingering reference store files"); System.err.println(""); System.err.println(" Datafile Repair options: (expert features, use with caution!)"); @@ -3324,7 +3398,7 @@ public class HBaseFsck extends Configured implements Tool { System.err.println(""); System.err.println(" Metadata Repair shortcuts"); System.err.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " + - "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps"); + "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles"); System.err.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles"); setRetCode(-2); @@ -3431,6 +3505,8 @@ public class HBaseFsck extends Configured implements Tool { checkCorruptHFiles = true; } else if (cmd.equals("-sidelineCorruptHFiles")) { sidelineCorruptHFiles = true; + } else if (cmd.equals("-fixReferenceFiles")) { + setFixReferenceFiles(true); } else if (cmd.equals("-repair")) { // this attempts to merge overlapping hdfs regions, needs testing // under load @@ -3443,6 +3519,7 @@ public class HBaseFsck extends Configured implements Tool { setSidelineBigOverlaps(true); setFixSplitParents(false); setCheckHdfs(true); + setFixReferenceFiles(true); } else if (cmd.equals("-repairHoles")) { // this will make all missing hdfs regions available but may lose data setFixHdfsHoles(true); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java index 25509828187..404afa6c7d6 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java +++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestHBaseFsck.java @@ -1663,6 +1663,35 @@ public class TestHBaseFsck { doQuarantineTest(table, hbck, 3, 0, 0, 0, 1); } + /** + * Test fixing lingering reference file. + */ + @Test + public void testLingeringReferenceFile() throws Exception { + String table = "testLingeringReferenceFile"; + try { + setupTable(table); + assertEquals(ROWKEYS.length, countRows()); + + // Mess it up by creating a fake (empty) reference file; close the stream so it is not leaked + FileSystem fs = FileSystem.get(conf); + Path tableDir = FSUtils.getTablePath(FSUtils.getRootDir(conf), table); + Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0); + Path famDir = new Path(regionDir, FAM_STR); + Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538"); + fs.create(fakeReferenceFile).close(); + + HBaseFsck hbck = doFsck(conf, false); + assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE }); + // fix reference file + doFsck(conf, true); + // check that reference file fixed + assertNoErrors(doFsck(conf, false)); + } finally { + deleteTable(table); + } + } + /** * Test pluggable error reporter. It can be plugged in * from system property or configuration.
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java index 343402196f8..99f4f9b8a1c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/HbckTestingUtil.java @@ -38,13 +38,13 @@ public class HbckTestingUtil { public static HBaseFsck doFsck( Configuration conf, boolean fix, String table) throws Exception { - return doFsck(conf, fix, fix, fix, fix,fix, fix, fix, table); + return doFsck(conf, fix, fix, fix, fix,fix, fix, fix, fix, table); } public static HBaseFsck doFsck(Configuration conf, boolean fixAssignments, boolean fixMeta, boolean fixHdfsHoles, boolean fixHdfsOverlaps, boolean fixHdfsOrphans, boolean fixTableOrphans, boolean fixVersionFile, - String table) throws Exception { + boolean fixReferenceFiles, String table) throws Exception { HBaseFsck fsck = new HBaseFsck(conf, exec); fsck.connect(); fsck.setDisplayFullReport(); // i.e. -details @@ -56,6 +56,7 @@ public class HbckTestingUtil { fsck.setFixHdfsOrphans(fixHdfsOrphans); fsck.setFixTableOrphans(fixTableOrphans); fsck.setFixVersionFile(fixVersionFile); + fsck.setFixReferenceFiles(fixReferenceFiles); if (table != null) { fsck.includeTable(table); }