HBASE-7643 HFileArchiver.resolveAndArchive() race condition may lead to snapshot data loss
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1438972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e9f14f1073
commit
48a565a739
|
@ -55,6 +55,9 @@ public class HFileArchiver {
|
|||
private static final Log LOG = LogFactory.getLog(HFileArchiver.class);
|
||||
private static final String SEPARATOR = ".";
|
||||
|
||||
/** Number of retries in case of fs operation failure */
|
||||
private static final int DEFAULT_RETRIES_NUMBER = 3;
|
||||
|
||||
private HFileArchiver() {
|
||||
// hidden ctor since this is just a util
|
||||
}
|
||||
|
@ -136,6 +139,7 @@ public class HFileArchiver {
|
|||
try {
|
||||
success = resolveAndArchive(fs, regionArchiveDir, toArchive);
|
||||
} catch (IOException e) {
|
||||
LOG.error("Failed to archive: " + toArchive, e);
|
||||
success = false;
|
||||
}
|
||||
|
||||
|
@ -146,7 +150,7 @@ public class HFileArchiver {
|
|||
}
|
||||
|
||||
throw new IOException("Received error when attempting to archive files (" + toArchive
|
||||
+ "), cannot delete region directory.");
|
||||
+ "), cannot delete region directory. ");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -254,14 +258,12 @@ public class HFileArchiver {
|
|||
long start = EnvironmentEdgeManager.currentTimeMillis();
|
||||
List<File> failures = resolveAndArchive(fs, baseArchiveDir, toArchive, start);
|
||||
|
||||
// clean out the failures by just deleting them
|
||||
// notify that some files were not archived.
|
||||
// We can't delete the files otherwise snapshots or other backup system
|
||||
// that relies on the archiver end up with data loss.
|
||||
if (failures.size() > 0) {
|
||||
try {
|
||||
LOG.error("Failed to complete archive, deleting extra store files.");
|
||||
deleteFilesWithoutArchiving(failures);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Failed to delete store file(s) when archiving failed", e);
|
||||
}
|
||||
LOG.warn("Failed to complete archive of: " + failures +
|
||||
". Those files are still in the original location, and they may slow down reads.");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -364,52 +366,53 @@ public class HFileArchiver {
|
|||
if (!fs.delete(archiveFile, false)) {
|
||||
throw new IOException("Couldn't delete existing archive file (" + archiveFile
|
||||
+ ") or rename it to the backup file (" + backedupArchiveFile
|
||||
+ ")to make room for similarly named file.");
|
||||
+ ") to make room for similarly named file.");
|
||||
}
|
||||
}
|
||||
LOG.debug("Backed up archive file from: " + archiveFile);
|
||||
}
|
||||
|
||||
LOG.debug("No existing file in archive for:" + archiveFile + ", free to archive original file.");
|
||||
LOG.debug("No existing file in archive for:" + archiveFile +
|
||||
", free to archive original file.");
|
||||
|
||||
// at this point, we should have a free spot for the archive file
|
||||
if (currentFile.moveAndClose(archiveFile)) {
|
||||
boolean success = false;
|
||||
for (int i = 0; !success && i < DEFAULT_RETRIES_NUMBER; ++i) {
|
||||
if (i > 0) {
|
||||
// Ensure that the archive directory exists.
|
||||
// The previous "move to archive" operation has failed probably because
|
||||
// the cleaner has removed our archive directory (HBASE-7643).
|
||||
// (we're in a retry loop, so don't worry too much about the exception)
|
||||
try {
|
||||
if (!fs.exists(archiveDir)) {
|
||||
if (fs.mkdirs(archiveDir)) {
|
||||
LOG.debug("Created archive directory:" + archiveDir);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Failed to create the archive directory: " + archiveDir, e);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
success = currentFile.moveAndClose(archiveFile);
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Failed to archive file: " + currentFile + " on try #" + i, e);
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!success) {
|
||||
LOG.error("Failed to archive file:" + currentFile);
|
||||
return false;
|
||||
} else if (LOG.isDebugEnabled()) {
|
||||
}
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Finished archiving file from: " + currentFile + ", to: " + archiveFile);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple delete of regular files from the {@link FileSystem}.
|
||||
* <p>
|
||||
* This method is a more generic implementation that the other deleteXXX
|
||||
* methods in this class, allowing more code reuse at the cost of a couple
|
||||
* more, short-lived objects (which should have minimum impact on the jvm).
|
||||
* @param fs {@link FileSystem} where the files live
|
||||
* @param files {@link Collection} of files to be deleted
|
||||
* @throws IOException if a file cannot be deleted. All files will be
|
||||
* attempted to deleted before throwing the exception, rather than
|
||||
* failing at the first file.
|
||||
*/
|
||||
private static void deleteFilesWithoutArchiving(Collection<File> files) throws IOException {
|
||||
List<IOException> errors = new ArrayList<IOException>(0);
|
||||
for (File file : files) {
|
||||
try {
|
||||
LOG.debug("Deleting region file:" + file);
|
||||
file.delete();
|
||||
} catch (IOException e) {
|
||||
LOG.error("Failed to delete file:" + file);
|
||||
errors.add(e);
|
||||
}
|
||||
}
|
||||
if (errors.size() > 0) {
|
||||
throw MultipleIOException.createIOException(errors);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Without regard for backup, delete a region. Should be used with caution.
|
||||
* @param regionDir {@link Path} to the region to be deleted.
|
||||
|
@ -553,7 +556,7 @@ public class HFileArchiver {
|
|||
public boolean moveAndClose(Path dest) throws IOException {
|
||||
this.close();
|
||||
Path p = this.getPath();
|
||||
return !fs.rename(p, dest);
|
||||
return fs.rename(p, dest);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -36,7 +36,11 @@ import org.apache.hadoop.fs.PathFilter;
|
|||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.MediumTests;
|
||||
import org.apache.hadoop.hbase.Stoppable;
|
||||
import org.apache.hadoop.hbase.backup.HFileArchiver;
|
||||
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
||||
import org.apache.hadoop.hbase.master.MasterFileSystem;
|
||||
import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
|
||||
import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
|
@ -44,6 +48,7 @@ import org.apache.hadoop.hbase.util.Bytes;
|
|||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.util.HFileArchiveTestingUtil;
|
||||
import org.apache.hadoop.hbase.util.HFileArchiveUtil;
|
||||
import org.apache.hadoop.hbase.util.StoppableImplementation;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Assert;
|
||||
|
@ -319,6 +324,69 @@ public class TestHFileArchiving {
|
|||
archivedFiles.containsAll(storeFiles));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test HFileArchiver.resolveAndArchive() race condition HBASE-7643
|
||||
*/
|
||||
@Test
|
||||
public void testCleaningRace() throws Exception {
|
||||
final long TEST_TIME = 20 * 1000;
|
||||
|
||||
Configuration conf = UTIL.getMiniHBaseCluster().getMaster().getConfiguration();
|
||||
Path rootDir = UTIL.getDataTestDir("testCleaningRace");
|
||||
FileSystem fs = UTIL.getTestFileSystem();
|
||||
|
||||
Path archiveDir = new Path(rootDir, HConstants.HFILE_ARCHIVE_DIRECTORY);
|
||||
Path regionDir = new Path("table", "abcdef");
|
||||
Path familyDir = new Path(regionDir, "cf");
|
||||
|
||||
Path sourceRegionDir = new Path(rootDir, regionDir);
|
||||
fs.mkdirs(sourceRegionDir);
|
||||
|
||||
Stoppable stoppable = new StoppableImplementation();
|
||||
|
||||
// The cleaner should be looping without long pauses to reproduce the race condition.
|
||||
HFileCleaner cleaner = new HFileCleaner(1, stoppable, conf, fs, archiveDir);
|
||||
try {
|
||||
cleaner.start();
|
||||
|
||||
// Keep creating/archiving new files while the cleaner is running in the other thread
|
||||
long startTime = System.currentTimeMillis();
|
||||
for (long fid = 0; (System.currentTimeMillis() - startTime) < TEST_TIME; ++fid) {
|
||||
Path file = new Path(familyDir, String.valueOf(fid));
|
||||
Path sourceFile = new Path(rootDir, file);
|
||||
Path archiveFile = new Path(archiveDir, file);
|
||||
|
||||
fs.createNewFile(sourceFile);
|
||||
|
||||
try {
|
||||
// Try to archive the file
|
||||
HFileArchiver.archiveRegion(conf, fs, rootDir,
|
||||
sourceRegionDir.getParent(), sourceRegionDir);
|
||||
|
||||
// The archiver succeded, the file is no longer in the original location
|
||||
// but it's in the archive location.
|
||||
LOG.debug("hfile=" + fid + " should be in the archive");
|
||||
assertTrue(fs.exists(archiveFile));
|
||||
assertFalse(fs.exists(sourceFile));
|
||||
} catch (IOException e) {
|
||||
// The archiver is unable to archive the file. Probably HBASE-7643 race condition.
|
||||
// in this case, the file should not be archived, and we should have the file
|
||||
// in the original location.
|
||||
LOG.debug("hfile=" + fid + " should be in the source location");
|
||||
assertFalse(fs.exists(archiveFile));
|
||||
assertTrue(fs.exists(sourceFile));
|
||||
|
||||
// Avoid to have this file in the next run
|
||||
fs.delete(sourceFile, false);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
stoppable.stop("test end");
|
||||
cleaner.join();
|
||||
fs.delete(rootDir, true);
|
||||
}
|
||||
}
|
||||
|
||||
private void clearArchiveDirectory() throws IOException {
|
||||
UTIL.getTestFileSystem().delete(new Path(UTIL.getDefaultRootDirPath(), ".archive"), true);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue