HBASE-7643 HFileArchiver.resolveAndArchive() race condition may lead to snapshot data loss

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1438972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
mbertozzi 2013-01-26 21:59:20 +00:00
parent e9f14f1073
commit 48a565a739
2 changed files with 112 additions and 41 deletions

View File

@ -55,6 +55,9 @@ public class HFileArchiver {
private static final Log LOG = LogFactory.getLog(HFileArchiver.class);
private static final String SEPARATOR = ".";
/** Number of retries in case of fs operation failure */
private static final int DEFAULT_RETRIES_NUMBER = 3;
private HFileArchiver() {
// hidden ctor since this is just a util
}
@ -136,6 +139,7 @@ public class HFileArchiver {
try {
success = resolveAndArchive(fs, regionArchiveDir, toArchive);
} catch (IOException e) {
LOG.error("Failed to archive: " + toArchive, e);
success = false;
}
@ -254,14 +258,12 @@ public class HFileArchiver {
long start = EnvironmentEdgeManager.currentTimeMillis();
List<File> failures = resolveAndArchive(fs, baseArchiveDir, toArchive, start);
// clean out the failures by just deleting them
// notify that some files were not archived.
// We can't delete the files otherwise snapshots or other backup system
// that relies on the archiver end up with data loss.
if (failures.size() > 0) {
try {
LOG.error("Failed to complete archive, deleting extra store files.");
deleteFilesWithoutArchiving(failures);
} catch (IOException e) {
LOG.warn("Failed to delete store file(s) when archiving failed", e);
}
LOG.warn("Failed to complete archive of: " + failures +
". Those files are still in the original location, and they may slow down reads.");
return false;
}
return true;
@ -370,46 +372,47 @@ public class HFileArchiver {
LOG.debug("Backed up archive file from: " + archiveFile);
}
LOG.debug("No existing file in archive for:" + archiveFile + ", free to archive original file.");
LOG.debug("No existing file in archive for:" + archiveFile +
", free to archive original file.");
// at this point, we should have a free spot for the archive file
if (currentFile.moveAndClose(archiveFile)) {
boolean success = false;
for (int i = 0; !success && i < DEFAULT_RETRIES_NUMBER; ++i) {
if (i > 0) {
// Ensure that the archive directory exists.
// The previous "move to archive" operation has failed probably because
// the cleaner has removed our archive directory (HBASE-7643).
// (we're in a retry loop, so don't worry too much about the exception)
try {
if (!fs.exists(archiveDir)) {
if (fs.mkdirs(archiveDir)) {
LOG.debug("Created archive directory:" + archiveDir);
}
}
} catch (IOException e) {
LOG.warn("Failed to create the archive directory: " + archiveDir, e);
}
}
try {
success = currentFile.moveAndClose(archiveFile);
} catch (IOException e) {
LOG.warn("Failed to archive file: " + currentFile + " on try #" + i, e);
success = false;
}
}
if (!success) {
LOG.error("Failed to archive file:" + currentFile);
return false;
} else if (LOG.isDebugEnabled()) {
}
if (LOG.isDebugEnabled()) {
LOG.debug("Finished archiving file from: " + currentFile + ", to: " + archiveFile);
}
return true;
}
/**
 * Simple delete of regular files from the {@link FileSystem}.
 * <p>
 * This method is a more generic implementation than the other deleteXXX
 * methods in this class, allowing more code reuse at the cost of a couple
 * more, short-lived objects (which should have minimum impact on the jvm).
 * @param files {@link Collection} of files to be deleted
 * @throws IOException if a file cannot be deleted. All files will be
 *           attempted to deleted before throwing the exception, rather than
 *           failing at the first file.
 */
private static void deleteFilesWithoutArchiving(Collection<File> files) throws IOException {
  // collect per-file failures so every file gets a delete attempt
  List<IOException> errors = new ArrayList<IOException>(0);
  for (File file : files) {
    try {
      LOG.debug("Deleting region file:" + file);
      file.delete();
    } catch (IOException e) {
      // pass the cause to the logger so the stack trace is not lost
      LOG.error("Failed to delete file:" + file, e);
      errors.add(e);
    }
  }
  if (errors.size() > 0) {
    // surface all accumulated failures as a single aggregated exception
    throw MultipleIOException.createIOException(errors);
  }
}
/**
* Without regard for backup, delete a region. Should be used with caution.
* @param regionDir {@link Path} to the region to be deleted.
@ -553,7 +556,7 @@ public class HFileArchiver {
/**
 * Close this file and rename (move) it to the given destination.
 * @param dest target {@link Path} for the underlying file
 * @return {@code true} if the rename succeeded, {@code false} otherwise
 * @throws IOException if closing or renaming fails at the filesystem level
 */
public boolean moveAndClose(Path dest) throws IOException {
  this.close();
  Path p = this.getPath();
  // NOTE: the result must NOT be negated; returning !fs.rename(...) was the
  // HBASE-7643 bug that made successful archives look like failures.
  return fs.rename(p, dest);
}
/**

View File

@ -36,7 +36,11 @@ import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MediumTests;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.backup.HFileArchiver;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
@ -44,6 +48,7 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HFileArchiveTestingUtil;
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.StoppableImplementation;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
@ -319,6 +324,69 @@ public class TestHFileArchiving {
archivedFiles.containsAll(storeFiles));
}
/**
 * Test for the HFileArchiver.resolveAndArchive() race condition (HBASE-7643):
 * archive a stream of freshly-created files while an HFileCleaner thread is
 * concurrently purging the archive directory, and verify each file ends up in
 * exactly one of the two locations (archive on success, source on failure).
 */
@Test
public void testCleaningRace() throws Exception {
final long TEST_TIME = 20 * 1000;
Configuration conf = UTIL.getMiniHBaseCluster().getMaster().getConfiguration();
Path rootDir = UTIL.getDataTestDir("testCleaningRace");
FileSystem fs = UTIL.getTestFileSystem();
Path archiveDir = new Path(rootDir, HConstants.HFILE_ARCHIVE_DIRECTORY);
Path regionDir = new Path("table", "abcdef");
Path familyDir = new Path(regionDir, "cf");
Path sourceRegionDir = new Path(rootDir, regionDir);
fs.mkdirs(sourceRegionDir);
Stoppable stoppable = new StoppableImplementation();
// The cleaner should be looping without long pauses (1ms period) to make the
// race condition likely to be hit within the test window.
HFileCleaner cleaner = new HFileCleaner(1, stoppable, conf, fs, archiveDir);
try {
cleaner.start();
// Keep creating/archiving new files while the cleaner is running in the other thread
long startTime = System.currentTimeMillis();
for (long fid = 0; (System.currentTimeMillis() - startTime) < TEST_TIME; ++fid) {
Path file = new Path(familyDir, String.valueOf(fid));
Path sourceFile = new Path(rootDir, file);
Path archiveFile = new Path(archiveDir, file);
fs.createNewFile(sourceFile);
try {
// Try to archive the file
HFileArchiver.archiveRegion(conf, fs, rootDir,
sourceRegionDir.getParent(), sourceRegionDir);
// The archiver succeeded: the file must no longer be in the original
// location, and must be present in the archive location.
LOG.debug("hfile=" + fid + " should be in the archive");
assertTrue(fs.exists(archiveFile));
assertFalse(fs.exists(sourceFile));
} catch (IOException e) {
// The archiver was unable to archive the file, probably because of the
// HBASE-7643 race condition. In this case the file must NOT have been
// archived, and must still be present in the original location
// (no data loss — the pre-fix behavior deleted it instead).
LOG.debug("hfile=" + fid + " should be in the source location");
assertFalse(fs.exists(archiveFile));
assertTrue(fs.exists(sourceFile));
// Remove the leftover source file so it does not interfere with the next iteration
fs.delete(sourceFile, false);
}
}
} finally {
// Stop the cleaner thread and clean up the test directory regardless of outcome
stoppable.stop("test end");
cleaner.join();
fs.delete(rootDir, true);
}
}
/**
 * Recursively delete the ".archive" directory under the test root so each
 * test starts with an empty archive.
 * @throws IOException if the filesystem delete fails
 */
private void clearArchiveDirectory() throws IOException {
UTIL.getTestFileSystem().delete(new Path(UTIL.getDefaultRootDirPath(), ".archive"), true);
}