HBASE-16775 Fix flaky TestExportSnapshot#testExportRetry.

Reason for flakiness: the current test uses probability-based fault injection and triggers a failure 3% of the time. Earlier, when the test used LocalJobRunner, which didn't honor "mapreduce.map.maxattempts", it would pass 97% of the time (when no fault was injected) and fail 3% of the time (when a fault was injected). The point is that even when the test was completely wrong, we couldn't catch it, because it was probability based.

This change injects faults in a deterministic manner.
On the design side, it encapsulates all testing hooks in ExportSnapshot.java into a single inner class.
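
The mechanism, condensed from the ExportSnapshot.java changes below (the surrounding ExportMapper plumbing is elided): every injected failure kills the current map attempt, so the task attempt id is exactly the number of failures already injected, which removes the need for any randomness.

    // All test hooks live in one inner class.
    static class Testing {
      static final String CONF_TEST_FAILURE = "test.snapshot.export.failure";
      static final String CONF_TEST_FAILURE_COUNT = "test.snapshot.export.failure.count";
      int failuresCountToInject = 0;
      int injectedFailureCount = 0;
    }

    // In ExportMapper.setup(): resume the failure count from the attempt number.
    //   testing.failuresCountToInject = conf.getInt(Testing.CONF_TEST_FAILURE_COUNT, 0);
    //   testing.injectedFailureCount = context.getTaskAttemptID().getId();

    // Called from copyFile(): keep failing until the configured count is reached.
    private void injectTestFailure(final Context context, final SnapshotFileInfo inputInfo)
        throws IOException {
      if (!context.getConfiguration().getBoolean(Testing.CONF_TEST_FAILURE, false)) return;
      if (testing.injectedFailureCount >= testing.failuresCountToInject) return;
      testing.injectedFailureCount++;
      context.getCounter(Counter.COPY_FAILED).increment(1);
      throw new IOException("TEST FAILURE (" + testing.injectedFailureCount + " of max "
          + testing.failuresCountToInject + "): Unable to copy input=" + inputInfo);
    }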

Change-Id: Icba866e1d56a5281748df89f4dd374bc45bad249
Apekshit Sharma 2016-10-06 14:20:58 -07:00
parent cf3215d343
commit da5fb27eab
6 changed files with 97 additions and 76 deletions

View File

@ -29,7 +29,6 @@ import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
@ -110,9 +109,12 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
private static final String CONF_BANDWIDTH_MB = "snapshot.export.map.bandwidth.mb";
protected static final String CONF_SKIP_TMP = "snapshot.export.skip.tmp";
static final String CONF_TEST_FAILURE = "test.snapshot.export.failure";
static final String CONF_TEST_RETRY = "test.snapshot.export.failure.retry";
static class Testing {
static final String CONF_TEST_FAILURE = "test.snapshot.export.failure";
static final String CONF_TEST_FAILURE_COUNT = "test.snapshot.export.failure.count";
int failuresCountToInject = 0;
int injectedFailureCount = 0;
}
// Command line options and defaults.
static final class Options {
@ -149,12 +151,10 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
private static class ExportMapper extends Mapper<BytesWritable, NullWritable,
NullWritable, NullWritable> {
private static final Log LOG = LogFactory.getLog(ExportMapper.class);
final static int REPORT_SIZE = 1 * 1024 * 1024;
final static int BUFFER_SIZE = 64 * 1024;
private boolean testFailures;
private Random random;
private boolean verifyChecksum;
private String filesGroup;
private String filesUser;
@ -169,9 +169,12 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
private Path inputArchive;
private Path inputRoot;
private static Testing testing = new Testing();
@Override
public void setup(Context context) throws IOException {
Configuration conf = context.getConfiguration();
Configuration srcConf = HBaseConfiguration.createClusterConf(conf, null, CONF_SOURCE_PREFIX);
Configuration destConf = HBaseConfiguration.createClusterConf(conf, null, CONF_DEST_PREFIX);
@ -186,8 +189,6 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
inputArchive = new Path(inputRoot, HConstants.HFILE_ARCHIVE_DIRECTORY);
outputArchive = new Path(outputRoot, HConstants.HFILE_ARCHIVE_DIRECTORY);
testFailures = conf.getBoolean(CONF_TEST_FAILURE, false);
try {
srcConf.setBoolean("fs." + inputRoot.toUri().getScheme() + ".impl.disable.cache", true);
inputFs = FileSystem.get(inputRoot.toUri(), srcConf);
@ -210,6 +211,12 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
for (Counter c : Counter.values()) {
context.getCounter(c).increment(0);
}
if (context.getConfiguration().getBoolean(Testing.CONF_TEST_FAILURE, false)) {
testing.failuresCountToInject = conf.getInt(Testing.CONF_TEST_FAILURE_COUNT, 0);
// Get number of times we have already injected failure based on attempt number of this
// task.
testing.injectedFailureCount = context.getTaskAttemptID().getId();
}
}
@Override
@ -251,35 +258,23 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
return new Path(outputArchive, path);
}
/*
* Used by TestExportSnapshot to simulate a failure
/**
* Used by TestExportSnapshot to test for retries when failures happen.
* Failure is injected in {@link #copyFile(Context, SnapshotFileInfo, Path)}.
*/
private void injectTestFailure(final Context context, final SnapshotFileInfo inputInfo)
throws IOException {
if (testFailures) {
if (context.getConfiguration().getBoolean(CONF_TEST_RETRY, false)) {
if (random == null) {
random = new Random();
}
// FLAKY-TEST-WARN: lower is better, we can get some runs without the
// retry, but at least we reduce the number of test failures due to
// this test exception from the same map task.
if (random.nextFloat() < 0.03) {
throw new IOException("TEST RETRY FAILURE: Unable to copy input=" + inputInfo
+ " time=" + System.currentTimeMillis());
}
} else {
context.getCounter(Counter.COPY_FAILED).increment(1);
throw new IOException("TEST FAILURE: Unable to copy input=" + inputInfo);
}
}
if (!context.getConfiguration().getBoolean(Testing.CONF_TEST_FAILURE, false)) return;
if (testing.injectedFailureCount >= testing.failuresCountToInject) return;
testing.injectedFailureCount++;
context.getCounter(Counter.COPY_FAILED).increment(1);
LOG.debug("Injecting failure. Count: " + testing.injectedFailureCount);
throw new IOException(String.format("TEST FAILURE (%d of max %d): Unable to copy input=%s",
testing.injectedFailureCount, testing.failuresCountToInject, inputInfo));
}
private void copyFile(final Context context, final SnapshotFileInfo inputInfo,
final Path outputPath) throws IOException {
injectTestFailure(context, inputInfo);
// Get the file information
FileStatus inputStat = getSourceFileStatus(context, inputInfo);
@ -318,6 +313,7 @@ public class ExportSnapshot extends AbstractHBaseTool implements Tool {
}
} finally {
in.close();
injectTestFailure(context, inputInfo);
}
}

View File

@ -21,6 +21,7 @@ package org.apache.hadoop.hbase.snapshot;
import static org.apache.hadoop.util.ToolRunner.run;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import java.io.IOException;
import java.net.URI;
@ -44,12 +45,10 @@ import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.SnapshotDescription;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotFileInfo;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotRegionManifest;
import org.apache.hadoop.hbase.testclassification.VerySlowMapReduceTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.util.ToolRunner;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
@ -57,6 +56,7 @@ import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.junit.rules.TestRule;
/**
@ -72,6 +72,9 @@ public class TestExportSnapshot {
protected final static byte[] FAMILY = Bytes.toBytes("cf");
@Rule
public final TestName testName = new TestName();
protected TableName tableName;
private byte[] emptySnapshotName;
private byte[] snapshotName;
@ -85,17 +88,27 @@ public class TestExportSnapshot {
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 6);
conf.setBoolean("hbase.master.enabletable.roundrobin", true);
conf.setInt("mapreduce.map.maxattempts", 10);
// If a single node has enough failures (default 3), resource manager will blacklist it.
// With only 2 nodes and tests injecting faults, we don't want that.
conf.setInt("mapreduce.job.maxtaskfailures.per.tracker", 100);
}
@BeforeClass
public static void setUpBeforeClass() throws Exception {
setUpBaseConf(TEST_UTIL.getConfiguration());
TEST_UTIL.startMiniCluster(3);
// Setup separate test-data directory for MR cluster and set corresponding configurations.
// Otherwise, different test classes running MR cluster can step on each other.
TEST_UTIL.getDataTestDir();
TEST_UTIL.startMiniZKCluster();
TEST_UTIL.startMiniMapReduceCluster();
TEST_UTIL.startMiniHBaseCluster(1, 3);
}
@AfterClass
public static void tearDownAfterClass() throws Exception {
TEST_UTIL.shutdownMiniCluster();
TEST_UTIL.shutdownMiniHBaseCluster();
TEST_UTIL.shutdownMiniMapReduceCluster();
TEST_UTIL.shutdownMiniZKCluster();
}
/**
@ -105,10 +118,9 @@ public class TestExportSnapshot {
public void setUp() throws Exception {
this.admin = TEST_UTIL.getAdmin();
long tid = System.currentTimeMillis();
tableName = TableName.valueOf("testtb-" + tid);
snapshotName = Bytes.toBytes("snaptb0-" + tid);
emptySnapshotName = Bytes.toBytes("emptySnaptb0-" + tid);
tableName = TableName.valueOf("testtb-" + testName.getMethodName());
snapshotName = Bytes.toBytes("snaptb0-" + testName.getMethodName());
emptySnapshotName = Bytes.toBytes("emptySnaptb0-" + testName.getMethodName());
// create Table
createTable();
@ -191,16 +203,16 @@ public class TestExportSnapshot {
Path copyDir, boolean overwrite) throws Exception {
testExportFileSystemState(TEST_UTIL.getConfiguration(), tableName, snapshotName, targetName,
filesExpected, TEST_UTIL.getDefaultRootDirPath(), copyDir,
overwrite, getBypassRegionPredicate());
overwrite, getBypassRegionPredicate(), true);
}
/**
* Test ExportSnapshot
* Creates destination directory, runs ExportSnapshot() tool, and runs some verifications.
*/
protected static void testExportFileSystemState(final Configuration conf, final TableName tableName,
final byte[] snapshotName, final byte[] targetName, final int filesExpected,
final Path sourceDir, Path copyDir, final boolean overwrite,
final RegionPredicate bypassregionPredicate) throws Exception {
final RegionPredicate bypassregionPredicate, boolean success) throws Exception {
URI hdfsUri = FileSystem.get(conf).getUri();
FileSystem fs = FileSystem.get(copyDir.toUri(), new Configuration());
copyDir = copyDir.makeQualified(fs);
@ -218,7 +230,12 @@ public class TestExportSnapshot {
// Export Snapshot
int res = run(conf, new ExportSnapshot(), opts.toArray(new String[opts.size()]));
assertEquals(0, res);
assertEquals(success ? 0 : 1, res);
if (!success) {
final Path targetDir = new Path(HConstants.SNAPSHOT_DIR_NAME, Bytes.toString(targetName));
assertFalse(fs.exists(new Path(copyDir, targetDir)));
return;
}
// Verify File-System state
FileStatus[] rootFiles = fs.listStatus(copyDir);
@ -242,42 +259,35 @@ public class TestExportSnapshot {
}
/**
* Check that ExportSnapshot will return a failure if something fails.
*/
@Test
public void testExportFailure() throws Exception {
assertEquals(1, runExportAndInjectFailures(snapshotName, false));
}
/**
* Check that ExportSnapshot will succede if something fails but the retry succede.
Check that ExportSnapshot will succeed if something fails but the retry succeeds.
*/
@Test
public void testExportRetry() throws Exception {
assertEquals(0, runExportAndInjectFailures(snapshotName, true));
}
/*
* Execute the ExportSnapshot job injecting failures
*/
private int runExportAndInjectFailures(final byte[] snapshotName, boolean retry)
throws Exception {
Path copyDir = getLocalDestinationDir();
URI hdfsUri = FileSystem.get(TEST_UTIL.getConfiguration()).getUri();
FileSystem fs = FileSystem.get(copyDir.toUri(), new Configuration());
copyDir = copyDir.makeQualified(fs);
Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
conf.setBoolean(ExportSnapshot.CONF_TEST_FAILURE, true);
conf.setBoolean(ExportSnapshot.CONF_TEST_RETRY, retry);
if (!retry) {
conf.setInt("mapreduce.map.maxattempts", 3);
}
// Export Snapshot
Path sourceDir = TEST_UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getRootDir();
String[] args = new String[] { "--snapshot", Bytes.toString(snapshotName),
"--copy-from", sourceDir.toString(), "--copy-to", copyDir.toString() };
return ToolRunner.run(conf, new ExportSnapshot(), args);
conf.setBoolean(ExportSnapshot.Testing.CONF_TEST_FAILURE, true);
conf.setInt(ExportSnapshot.Testing.CONF_TEST_FAILURE_COUNT, 2);
conf.setInt("mapreduce.map.maxattempts", 3);
testExportFileSystemState(conf, tableName, snapshotName, snapshotName, tableNumFiles,
TEST_UTIL.getDefaultRootDirPath(), copyDir, true, getBypassRegionPredicate(), true);
}
/**
* Check that ExportSnapshot will fail if we inject failure more times than MR will retry.
*/
@Test
public void testExportFailure() throws Exception {
Path copyDir = getLocalDestinationDir();
FileSystem fs = FileSystem.get(copyDir.toUri(), new Configuration());
copyDir = copyDir.makeQualified(fs);
Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
conf.setBoolean(ExportSnapshot.Testing.CONF_TEST_FAILURE, true);
conf.setInt(ExportSnapshot.Testing.CONF_TEST_FAILURE_COUNT, 4);
conf.setInt("mapreduce.map.maxattempts", 3);
testExportFileSystemState(conf, tableName, snapshotName, snapshotName, tableNumFiles,
TEST_UTIL.getDefaultRootDirPath(), copyDir, true, getBypassRegionPredicate(), false);
}
/*

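For reference, the expected outcomes of the two tests follow directly from the two knobs set above; this is just the attempt-by-attempt arithmetic, not additional code in the commit:

    // testExportRetry:   test.snapshot.export.failure.count = 2, mapreduce.map.maxattempts = 3
    //   attempt 0: injectedFailureCount starts at 0 -> failure #1 injected
    //   attempt 1: injectedFailureCount starts at 1 -> failure #2 injected
    //   attempt 2: injectedFailureCount starts at 2 >= 2 -> no failure, export succeeds (exit code 0)
    //
    // testExportFailure: test.snapshot.export.failure.count = 4, mapreduce.map.maxattempts = 3
    //   attempts 0, 1 and 2 all inject a failure -> the task exhausts its attempts,
    //   the job fails and ExportSnapshot returns exit code 1
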
View File

@ -104,7 +104,7 @@ public class TestExportSnapshotNoCluster {
TableName tableName = builder.getTableDescriptor().getTableName();
TestExportSnapshot.testExportFileSystemState(TEST_UTIL.getConfiguration(),
tableName, snapshotName, snapshotName, snapshotFilesCount,
testDir, getDestinationDir(), false, null);
testDir, getDestinationDir(), false, null, true);
}
private Path getDestinationDir() {

View File

@ -45,7 +45,12 @@ public class TestMobExportSnapshot extends TestExportSnapshot {
@BeforeClass
public static void setUpBeforeClass() throws Exception {
setUpBaseConf(TEST_UTIL.getConfiguration());
TEST_UTIL.startMiniCluster(3);
// Setup separate test-data directory for MR cluster and set corresponding configurations.
// Otherwise, different test classes running MR cluster can step on each other.
TEST_UTIL.getDataTestDir();
TEST_UTIL.startMiniZKCluster();
TEST_UTIL.startMiniMapReduceCluster();
TEST_UTIL.startMiniHBaseCluster(1, 3);
}
@Override

View File

@ -37,6 +37,9 @@ public class TestMobSecureExportSnapshot extends TestMobExportSnapshot {
@BeforeClass
public static void setUpBeforeClass() throws Exception {
setUpBaseConf(TEST_UTIL.getConfiguration());
// Setup separate test-data directory for MR cluster and set corresponding configurations.
// Otherwise, different test classes running MR cluster can step on each other.
TEST_UTIL.getDataTestDir();
// set the always on security provider
UserProvider.setUserProviderForTesting(TEST_UTIL.getConfiguration(),
@ -45,7 +48,9 @@ public class TestMobSecureExportSnapshot extends TestMobExportSnapshot {
// setup configuration
SecureTestUtil.enableSecurity(TEST_UTIL.getConfiguration());
TEST_UTIL.startMiniCluster(3);
TEST_UTIL.startMiniZKCluster();
TEST_UTIL.startMiniMapReduceCluster();
TEST_UTIL.startMiniHBaseCluster(1, 3);
// Wait for the ACL table to become available
TEST_UTIL.waitTableEnabled(AccessControlLists.ACL_TABLE_NAME);

View File

@ -42,6 +42,9 @@ public class TestSecureExportSnapshot extends TestExportSnapshot {
@BeforeClass
public static void setUpBeforeClass() throws Exception {
setUpBaseConf(TEST_UTIL.getConfiguration());
// Setup separate test-data directory for MR cluster and set corresponding configurations.
// Otherwise, different test classes running MR cluster can step on each other.
TEST_UTIL.getDataTestDir();
// set the always on security provider
UserProvider.setUserProviderForTesting(TEST_UTIL.getConfiguration(),
@ -50,7 +53,9 @@ public class TestSecureExportSnapshot extends TestExportSnapshot {
// setup configuration
SecureTestUtil.enableSecurity(TEST_UTIL.getConfiguration());
TEST_UTIL.startMiniCluster(3);
TEST_UTIL.startMiniZKCluster();
TEST_UTIL.startMiniMapReduceCluster();
TEST_UTIL.startMiniHBaseCluster(1, 3);
// Wait for the ACL table to become available
TEST_UTIL.waitTableEnabled(AccessControlLists.ACL_TABLE_NAME);