HBASE-7199 hbck should check lingering reference hfile and have option to sideline them automatically

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1423970 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
jxiang 2012-12-19 17:29:39 +00:00
parent d4e08292b7
commit 04b3f13b38
4 changed files with 112 additions and 6 deletions

View File

@ -65,7 +65,6 @@ import org.apache.hadoop.hbase.util.BloomFilter;
import org.apache.hadoop.hbase.util.BloomFilterFactory; import org.apache.hadoop.hbase.util.BloomFilterFactory;
import org.apache.hadoop.hbase.util.BloomFilterWriter; import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Writables; import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.RawComparator;
@ -336,7 +335,7 @@ public class StoreFile {
* @return Calculated path to parent region file. * @return Calculated path to parent region file.
* @throws IOException * @throws IOException
*/ */
static Path getReferredToFile(final Path p) { public static Path getReferredToFile(final Path p) {
Matcher m = REF_NAME_PARSER.matcher(p.getName()); Matcher m = REF_NAME_PARSER.matcher(p.getName());
if (m == null || !m.matches()) { if (m == null || !m.matches()) {
LOG.warn("Failed match of store file name " + p.toString()); LOG.warn("Failed match of store file name " + p.toString());

View File

@ -85,6 +85,7 @@ import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.regionserver.wal.HLogUtil; import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE; import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
@ -191,6 +192,7 @@ public class HBaseFsck extends Configured implements Tool {
private boolean fixTableOrphans = false; // fix fs holes (missing .tableinfo) private boolean fixTableOrphans = false; // fix fs holes (missing .tableinfo)
private boolean fixVersionFile = false; // fix missing hbase.version file in hdfs private boolean fixVersionFile = false; // fix missing hbase.version file in hdfs
private boolean fixSplitParents = false; // fix lingering split parents private boolean fixSplitParents = false; // fix lingering split parents
private boolean fixReferenceFiles = false; // fix lingering reference store file
// limit checking/fixes to listed tables, if empty attempt to check/fix all // limit checking/fixes to listed tables, if empty attempt to check/fix all
// -ROOT- and .META. are always checked // -ROOT- and .META. are always checked
@ -442,6 +444,8 @@ public class HBaseFsck extends Configured implements Tool {
admin.setBalancerRunning(oldBalancer, false); admin.setBalancerRunning(oldBalancer, false);
} }
offlineReferenceFileRepair();
// Print table summary // Print table summary
printTableSummary(tablesInfo); printTableSummary(tablesInfo);
return errors.summarize(); return errors.summarize();
@ -597,6 +601,67 @@ public class HBaseFsck extends Configured implements Tool {
return errors.getErrorList().size(); return errors.getErrorList().size();
} }
/**
 * Scan all the store file names to find any lingering reference files,
 * which refer to some non-existing files. If "fix" option is enabled,
 * any lingering reference file will be sidelined if found.
 * <p>
 * Lingering reference file prevents a region from opening. It has to
 * be fixed before a cluster can start properly.
 *
 * @throws IOException propagated from the underlying file system calls
 */
private void offlineReferenceFileRepair() throws IOException {
  Configuration conf = getConf();
  Path hbaseRoot = FSUtils.getRootDir(conf);
  FileSystem fs = hbaseRoot.getFileSystem(conf);
  Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot);
  for (Path path: allFiles.values()) {
    boolean isReference = false;
    try {
      isReference = StoreFile.isReference(path);
    } catch (Exception ignored) {
      // Ignore. Some files may not be store files at all.
      // For example, files under .oldlogs folder in .META.
      // Warning message is already logged by
      // StoreFile#isReference.
      // Only Exception is caught so that JVM Errors are not hidden.
    }
    if (!isReference) continue;

    Path referredToFile = StoreFile.getReferredToFile(path);
    if (fs.exists(referredToFile)) continue;  // good, expected

    // Found a lingering reference file
    errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
      "Found lingering reference file " + path);
    if (!shouldFixReferenceFiles()) continue;

    // Now, trying to fix it since requested
    boolean success = false;
    String pathStr = path.toString();

    // A reference file path should be like
    // ${hbase.rootdir}/table_name/region_id/family_name/referred_file.region_name
    // Up 3 directories to get the table folder.
    // So the file will be sidelined to a similar folder structure.
    int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
    for (int i = 0; index > 0 && i < 3; i++) {
      // lastIndexOf(ch, from) includes position 'from' in the search, so
      // start one character earlier to reach the previous separator.
      index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
    }
    if (index > 0) {
      Path rootDir = getSidelineDir();
      // Skip the leading separator so the remainder is a relative path;
      // an absolute child would not be resolved under the sideline dir.
      Path dst = new Path(rootDir, pathStr.substring(index + 1));
      fs.mkdirs(dst.getParent());
      LOG.info("Trying to sideline reference file "
        + path + " to " + dst);
      setShouldRerun();

      success = fs.rename(path, dst);
    }
    if (!success) {
      LOG.error("Failed to sideline reference file " + path);
    }
  }
}
/** /**
* TODO -- need to add tests for this. * TODO -- need to add tests for this.
*/ */
@ -2771,7 +2836,7 @@ public class HBaseFsck extends Configured implements Tool {
MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE, MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS, FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION, HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE
} }
public void clear(); public void clear();
public void report(String message); public void report(String message);
@ -3204,6 +3269,14 @@ public class HBaseFsck extends Configured implements Tool {
return fixSplitParents; return fixSplitParents;
} }
/**
 * Turn sidelining of lingering reference store files on or off.
 *
 * @param shouldFix true to sideline lingering reference files when found
 */
public void setFixReferenceFiles(boolean shouldFix) {
  this.fixReferenceFiles = shouldFix;
}
/**
 * @return the current value of the fixReferenceFiles flag
 */
boolean shouldFixReferenceFiles() {
  return this.fixReferenceFiles;
}
public boolean shouldIgnorePreCheckPermission() { public boolean shouldIgnorePreCheckPermission() {
return ignorePreCheckPermission; return ignorePreCheckPermission;
} }
@ -3315,6 +3388,7 @@ public class HBaseFsck extends Configured implements Tool {
System.err.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)"); System.err.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
System.err.println(" -fixSplitParents Try to force offline split parents to be online."); System.err.println(" -fixSplitParents Try to force offline split parents to be online.");
System.err.println(" -ignorePreCheckPermission ignore filesystem permission pre-check"); System.err.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
System.err.println(" -fixReferenceFiles Try to offline lingering reference store files");
System.err.println(""); System.err.println("");
System.err.println(" Datafile Repair options: (expert features, use with caution!)"); System.err.println(" Datafile Repair options: (expert features, use with caution!)");
@ -3324,7 +3398,7 @@ public class HBaseFsck extends Configured implements Tool {
System.err.println(""); System.err.println("");
System.err.println(" Metadata Repair shortcuts"); System.err.println(" Metadata Repair shortcuts");
System.err.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " + System.err.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
"-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps"); "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps -fixReferenceFiles");
System.err.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles"); System.err.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
setRetCode(-2); setRetCode(-2);
@ -3431,6 +3505,8 @@ public class HBaseFsck extends Configured implements Tool {
checkCorruptHFiles = true; checkCorruptHFiles = true;
} else if (cmd.equals("-sidelineCorruptHFiles")) { } else if (cmd.equals("-sidelineCorruptHFiles")) {
sidelineCorruptHFiles = true; sidelineCorruptHFiles = true;
} else if (cmd.equals("-fixReferenceFiles")) {
setFixReferenceFiles(true);
} else if (cmd.equals("-repair")) { } else if (cmd.equals("-repair")) {
// this attempts to merge overlapping hdfs regions, needs testing // this attempts to merge overlapping hdfs regions, needs testing
// under load // under load
@ -3443,6 +3519,7 @@ public class HBaseFsck extends Configured implements Tool {
setSidelineBigOverlaps(true); setSidelineBigOverlaps(true);
setFixSplitParents(false); setFixSplitParents(false);
setCheckHdfs(true); setCheckHdfs(true);
setFixReferenceFiles(true);
} else if (cmd.equals("-repairHoles")) { } else if (cmd.equals("-repairHoles")) {
// this will make all missing hdfs regions available but may lose data // this will make all missing hdfs regions available but may lose data
setFixHdfsHoles(true); setFixHdfsHoles(true);

View File

@ -1663,6 +1663,35 @@ public class TestHBaseFsck {
doQuarantineTest(table, hbck, 3, 0, 0, 0, 1); doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
} }
/**
 * Test fixing lingering reference file.
 * <p>
 * Creates a file whose name looks like a reference file, checks that hbck
 * reports it as lingering, runs hbck in fix mode, and finally checks that
 * a further hbck run reports no errors.
 */
@Test
public void testLingeringReferenceFile() throws Exception {
  String table = "testLingeringReferenceFile";
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());

    // Mess it up by creating a fake reference file
    FileSystem fs = FileSystem.get(conf);
    Path tableDir= FSUtils.getTablePath(FSUtils.getRootDir(conf), table);
    Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
    Path famDir = new Path(regionDir, FAM_STR);
    Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
    // Close the returned stream right away: only the (empty) file is
    // needed, and leaving the stream open would leak it.
    fs.create(fakeReferenceFile).close();

    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
    // fix reference file
    doFsck(conf, true);
    // check that reference file fixed
    assertNoErrors(doFsck(conf, false));
  } finally {
    deleteTable(table);
  }
}
/** /**
* Test pluggable error reporter. It can be plugged in * Test pluggable error reporter. It can be plugged in
* from system property or configuration. * from system property or configuration.

View File

@ -38,13 +38,13 @@ public class HbckTestingUtil {
public static HBaseFsck doFsck( public static HBaseFsck doFsck(
Configuration conf, boolean fix, String table) throws Exception { Configuration conf, boolean fix, String table) throws Exception {
return doFsck(conf, fix, fix, fix, fix,fix, fix, fix, table); return doFsck(conf, fix, fix, fix, fix,fix, fix, fix, fix, table);
} }
public static HBaseFsck doFsck(Configuration conf, boolean fixAssignments, public static HBaseFsck doFsck(Configuration conf, boolean fixAssignments,
boolean fixMeta, boolean fixHdfsHoles, boolean fixHdfsOverlaps, boolean fixMeta, boolean fixHdfsHoles, boolean fixHdfsOverlaps,
boolean fixHdfsOrphans, boolean fixTableOrphans, boolean fixVersionFile, boolean fixHdfsOrphans, boolean fixTableOrphans, boolean fixVersionFile,
String table) throws Exception { boolean fixReferenceFiles, String table) throws Exception {
HBaseFsck fsck = new HBaseFsck(conf, exec); HBaseFsck fsck = new HBaseFsck(conf, exec);
fsck.connect(); fsck.connect();
fsck.setDisplayFullReport(); // i.e. -details fsck.setDisplayFullReport(); // i.e. -details
@ -56,6 +56,7 @@ public class HbckTestingUtil {
fsck.setFixHdfsOrphans(fixHdfsOrphans); fsck.setFixHdfsOrphans(fixHdfsOrphans);
fsck.setFixTableOrphans(fixTableOrphans); fsck.setFixTableOrphans(fixTableOrphans);
fsck.setFixVersionFile(fixVersionFile); fsck.setFixVersionFile(fixVersionFile);
fsck.setFixReferenceFiles(fixReferenceFiles);
if (table != null) { if (table != null) {
fsck.includeTable(table); fsck.includeTable(table);
} }