From 3909b7c96fc36e9fddef90fa7ade9ed61f19826d Mon Sep 17 00:00:00 2001 From: chenheng Date: Mon, 22 Aug 2016 16:45:20 +0800 Subject: [PATCH] HBASE-16464 archive folder grows bigger and bigger due to corrupt snapshot under tmp dir --- .../master/snapshot/SnapshotFileCache.java | 17 +++++++++++++++- .../master/snapshot/TakeSnapshotHandler.java | 2 +- .../snapshot/SnapshotDescriptionUtils.java | 14 +++++++++++++ .../hbase/snapshot/SnapshotManifestV2.java | 18 +++++++++++------ .../snapshot/TestSnapshotHFileCleaner.java | 20 +++++++++++++++++++ .../hbase/snapshot/SnapshotTestingUtils.java | 12 +++++++++++ 6 files changed, 75 insertions(+), 8 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java index dfd3cb5e7b1..5b367c501ee 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/SnapshotFileCache.java @@ -39,6 +39,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Stoppable; +import org.apache.hadoop.hbase.snapshot.CorruptedSnapshotException; import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils; import org.apache.hadoop.hbase.util.FSUtils; @@ -300,7 +301,21 @@ public class SnapshotFileCache implements Stoppable { FileStatus[] running = FSUtils.listStatus(fs, snapshotTmpDir); if (running != null) { for (FileStatus run : running) { - snapshotInProgress.addAll(fileInspector.filesUnderSnapshot(run.getPath())); + try { + snapshotInProgress.addAll(fileInspector.filesUnderSnapshot(run.getPath())); + } catch (CorruptedSnapshotException e) { + // See HBASE-16464 + if (e.getCause() instanceof FileNotFoundException) { + // If the snapshot is not in progress, we will delete it + if (!fs.exists(new 
Path(run.getPath(), + SnapshotDescriptionUtils.SNAPSHOT_IN_PROGRESS))) { + fs.delete(run.getPath(), true); + LOG.warn("delete the " + run.getPath() + " due to exception:", e.getCause()); + } + } else { + throw e; + } + } } } return snapshotInProgress; diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java index 9172e06cf5f..8967a7093ed 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java @@ -164,7 +164,7 @@ public abstract class TakeSnapshotHandler extends EventHandler implements Snapsh try { // If regions move after this meta scan, the region specific snapshot should fail, triggering // an external exception that gets captured here. - + SnapshotDescriptionUtils.createInProgressTag(workingDir, fs); // write down the snapshot info in the working directory SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, fs); snapshotManifest.addTableDescriptor(this.htd); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotDescriptionUtils.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotDescriptionUtils.java index 560a42be349..2fd619d13e1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotDescriptionUtils.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotDescriptionUtils.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hbase.snapshot; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.Collections; @@ -104,6 +105,9 @@ public final class SnapshotDescriptionUtils { /** Temporary directory under the snapshot directory to store in-progress snapshots */ public static final String SNAPSHOT_TMP_DIR_NAME = ".tmp"; + + /** This tag will be 
created in in-progress snapshots */ + public static final String SNAPSHOT_IN_PROGRESS = ".inprogress"; // snapshot operation values /** Default value if no start time is specified */ public static final long NO_SNAPSHOT_START_TIME_SPECIFIED = 0; @@ -292,6 +296,16 @@ public final class SnapshotDescriptionUtils { } } + /** + * Create in-progress tag under .tmp of in-progress snapshot + * */ + public static void createInProgressTag(Path workingDir, FileSystem fs) throws IOException { + FsPermission perms = FSUtils.getFilePermissions(fs, fs.getConf(), + HConstants.DATA_FILE_UMASK_KEY); + Path snapshot_in_progress = new Path(workingDir, SnapshotDescriptionUtils.SNAPSHOT_IN_PROGRESS); + FSUtils.create(fs, snapshot_in_progress, perms, true); + } + /** * Read in the {@link org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription} stored for the snapshot in the passed directory * @param fs filesystem where the snapshot was taken diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotManifestV2.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotManifestV2.java index 3bb3575f3f6..df5dcd3f863 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotManifestV2.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/snapshot/SnapshotManifestV2.java @@ -82,12 +82,18 @@ public final class SnapshotManifestV2 { } public void regionClose(final SnapshotRegionManifest.Builder region) throws IOException { - SnapshotRegionManifest manifest = region.build(); - FSDataOutputStream stream = fs.create(getRegionManifestPath(snapshotDir, manifest)); - try { - manifest.writeTo(stream); - } finally { - stream.close(); + // we should ensure the snapshot dir exists, maybe it has been deleted by master + // see HBASE-16464 + if (fs.exists(snapshotDir)) { + SnapshotRegionManifest manifest = region.build(); + FSDataOutputStream stream = fs.create(getRegionManifestPath(snapshotDir, manifest)); + try { + 
manifest.writeTo(stream); + } finally { + stream.close(); + } + } else { + LOG.warn("can't write manifest without parent dir, maybe it has been deleted by master?"); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotHFileCleaner.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotHFileCleaner.java index b4355baf6a0..88ca5a08fb8 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotHFileCleaner.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/snapshot/TestSnapshotHFileCleaner.java @@ -168,4 +168,24 @@ public class TestSnapshotHFileCleaner { fs.delete(SnapshotDescriptionUtils.getWorkingSnapshotDir(rootDir), true); } } + + + /** + * HBASE-16464 + */ + @Test + public void testMissedTmpSnapshot() throws IOException { + SnapshotTestingUtils.SnapshotMock + snapshotMock = new SnapshotTestingUtils.SnapshotMock(TEST_UTIL.getConfiguration(), fs, rootDir); + SnapshotTestingUtils.SnapshotMock.SnapshotBuilder builder = snapshotMock.createSnapshotV2( + SNAPSHOT_NAME_STR, TABLE_NAME_STR); + builder.addRegionV2(); + builder.missOneRegionSnapshotFile(); + + long period = Long.MAX_VALUE; + SnapshotFileCache cache = new SnapshotFileCache(fs, rootDir, period, 10000000, + "test-snapshot-file-cache-refresh", new SnapshotFiles()); + cache.getSnapshotsInProgress(); + assertFalse(fs.exists(builder.getSnapshotsDir())); + } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/SnapshotTestingUtils.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/SnapshotTestingUtils.java index 38afc3bae1c..770bb009da4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/SnapshotTestingUtils.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/snapshot/SnapshotTestingUtils.java @@ -583,6 +583,18 @@ public final class SnapshotTestingUtils { corruptFile(p); } + public void missOneRegionSnapshotFile() throws 
IOException { + FileStatus[] manifestFiles = FSUtils.listStatus(fs, snapshotDir); + for (FileStatus fileStatus : manifestFiles) { + String fileName = fileStatus.getPath().getName(); + if (fileName.endsWith(SnapshotDescriptionUtils.SNAPSHOTINFO_FILE) + || fileName.endsWith(".tabledesc") + || fileName.endsWith(SnapshotDescriptionUtils.SNAPSHOT_TMP_DIR_NAME)) { + fs.delete(fileStatus.getPath(), true); + } + } + } + /** * Corrupt data-manifest file *