MAPREDUCE-6638. Do not attempt to recover progress from previous job attempts if spill encryption is enabled. (Haibo Chen via kasha)

(cherry picked from commit de7a0a92ca)
This commit is contained in:
Karthik Kambatla 2016-10-03 10:30:22 -07:00
parent 03b797a6ac
commit 876fc2e61a
2 changed files with 130 additions and 28 deletions

View File

@ -146,7 +146,6 @@ import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.security.client.ClientToAMTokenSecretManager;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.SystemClock;
import org.apache.log4j.LogManager;
@ -1267,44 +1266,77 @@ public class MRAppMaster extends CompositeService {
}
private void processRecovery() throws IOException{
if (appAttemptID.getAttemptId() == 1) {
return; // no need to recover on the first attempt
boolean attemptRecovery = shouldAttemptRecovery();
boolean recoverySucceeded = true;
if (attemptRecovery) {
LOG.info("Attempting to recover.");
try {
parsePreviousJobHistory();
} catch (IOException e) {
LOG.warn("Unable to parse prior job history, aborting recovery", e);
recoverySucceeded = false;
}
}
if (!isFirstAttempt() && (!attemptRecovery || !recoverySucceeded)) {
amInfos.addAll(readJustAMInfos());
}
}
private boolean isFirstAttempt() {
return appAttemptID.getAttemptId() == 1;
}
/**
* Check if the current job attempt should try to recover from previous
* job attempts if any.
*/
private boolean shouldAttemptRecovery() throws IOException {
if (isFirstAttempt()) {
return false; // no need to recover on the first attempt
}
boolean recoveryEnabled = getConfig().getBoolean(
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE,
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE_DEFAULT);
if (!recoveryEnabled) {
LOG.info("Not attempting to recover. Recovery disabled. To enable " +
"recovery, set " + MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE);
return false;
}
boolean recoverySupportedByCommitter = isRecoverySupported();
if (!recoverySupportedByCommitter) {
LOG.info("Not attempting to recover. Recovery is not supported by " +
committer.getClass() + ". Use an OutputCommitter that supports" +
" recovery.");
return false;
}
// If a shuffle secret was not provided by the job client then this app
// attempt will generate one. However that disables recovery if there
// are reducers as the shuffle secret would be app attempt specific.
int numReduceTasks = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
int reducerCount = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
// If a shuffle secret was not provided by the job client, one will be
// generated in this job attempt. However, that disables recovery if
// there are reducers as the shuffle secret would be job attempt specific.
boolean shuffleKeyValidForRecovery =
TokenCache.getShuffleSecretKey(jobCredentials) != null;
if (recoveryEnabled && recoverySupportedByCommitter
&& (numReduceTasks <= 0 || shuffleKeyValidForRecovery)) {
LOG.info("Recovery is enabled. "
+ "Will try to recover from previous life on best effort basis.");
try {
parsePreviousJobHistory();
} catch (IOException e) {
LOG.warn("Unable to parse prior job history, aborting recovery", e);
// try to get just the AMInfos
amInfos.addAll(readJustAMInfos());
}
} else {
LOG.info("Will not try to recover. recoveryEnabled: "
+ recoveryEnabled + " recoverySupportedByCommitter: "
+ recoverySupportedByCommitter + " numReduceTasks: "
+ numReduceTasks + " shuffleKeyValidForRecovery: "
+ shuffleKeyValidForRecovery + " ApplicationAttemptID: "
+ appAttemptID.getAttemptId());
// Get the amInfos anyways whether recovery is enabled or not
amInfos.addAll(readJustAMInfos());
if (reducerCount > 0 && !shuffleKeyValidForRecovery) {
LOG.info("Not attempting to recover. The shuffle key is invalid for " +
"recovery.");
return false;
}
// If the intermediate data is encrypted, recovering the job requires the
// access to the key. Until the encryption key is persisted, we should
// avoid attempts to recover.
boolean spillEncrypted = CryptoUtils.isEncryptedSpillEnabled(getConfig());
if (reducerCount > 0 && spillEncrypted) {
LOG.info("Not attempting to recover. Intermediate spill encryption" +
" is enabled.");
return false;
}
return true;
}
private static FSDataInputStream getPreviousJobHistoryStream(
@ -1404,6 +1436,10 @@ public class MRAppMaster extends CompositeService {
return amInfos;
}
public boolean recovered() {
return recoveredJobStartTime > 0;
}
/**
* This can be overridden to instantiate multiple jobs and create a
* workflow.

View File

@ -579,6 +579,72 @@ public class TestRecovery {
app.verifyCompleted();
}
@Test
public void testRecoveryWithSpillEncryption() throws Exception {
int runCount = 0;
MRApp app = new MRAppWithHistory(1, 1, false, this.getClass().getName(),
true, ++runCount) {
};
Configuration conf = new Configuration();
conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
conf.setBoolean("mapred.mapper.new-api", true);
conf.setBoolean("mapred.reducer.new-api", true);
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
conf.setBoolean(MRJobConfig.MR_ENCRYPTED_INTERMEDIATE_DATA, true);
// run the MR job at the first attempt
Job jobAttempt1 = app.submit(conf);
app.waitForState(jobAttempt1, JobState.RUNNING);
Iterator<Task> tasks = jobAttempt1.getTasks().values().iterator();
// finish the map task but the reduce task
Task mapper = tasks.next();
app.waitForState(mapper, TaskState.RUNNING);
TaskAttempt mapAttempt = mapper.getAttempts().values().iterator().next();
app.waitForState(mapAttempt, TaskAttemptState.RUNNING);
app.getContext().getEventHandler().handle(
new TaskAttemptEvent(mapAttempt.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(mapper, TaskState.SUCCEEDED);
// crash the first attempt of the MR job
app.stop();
// run the MR job again at the second attempt
app = new MRAppWithHistory(1, 1, false, this.getClass().getName(), false,
++runCount);
Job jobAttempt2 = app.submit(conf);
Assert.assertTrue("Recovery from previous job attempt is processed even " +
"though intermediate data encryption is enabled.", !app.recovered());
// The map task succeeded from previous job attempt will not be recovered
// because the data spill encryption is enabled.
// Let's finish the job at the second attempt and verify its completion.
app.waitForState(jobAttempt2, JobState.RUNNING);
tasks = jobAttempt2.getTasks().values().iterator();
mapper = tasks.next();
Task reducer = tasks.next();
// finish the map task first
app.waitForState(mapper, TaskState.RUNNING);
mapAttempt = mapper.getAttempts().values().iterator().next();
app.waitForState(mapAttempt, TaskAttemptState.RUNNING);
app.getContext().getEventHandler().handle(
new TaskAttemptEvent(mapAttempt.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(mapper, TaskState.SUCCEEDED);
// then finish the reduce task
TaskAttempt redAttempt = reducer.getAttempts().values().iterator().next();
app.waitForState(redAttempt, TaskAttemptState.RUNNING);
app.getContext().getEventHandler().handle(
new TaskAttemptEvent(redAttempt.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(reducer, TaskState.SUCCEEDED);
// verify that the job succeeds at the 2rd attempt
app.waitForState(jobAttempt2, JobState.SUCCEEDED);
}
/**
* This test case primarily verifies if the recovery is controlled through config
* property. In this case, recover is turned OFF. AM with 3 maps and 0 reduce.