MAPREDUCE-7029. FileOutputCommitter is slow on filesystems lacking recursive delete. Contributed by Karthik Palaniappan
This commit is contained in:
parent
fd09f72151
commit
c228a7c707
|
@ -85,6 +85,17 @@ public class FileOutputCommitter extends OutputCommitter {
|
|||
// default value to be 1 to keep consistent with previous behavior
|
||||
public static final int FILEOUTPUTCOMMITTER_FAILURE_ATTEMPTS_DEFAULT = 1;
|
||||
|
||||
// Whether tasks should delete their task temporary directories. This is
|
||||
// purely an optimization for filesystems without O(1) recursive delete, as
|
||||
// commitJob will recursively delete the entire job temporary directory.
|
||||
// HDFS has O(1) recursive delete, so this parameter is left false by default.
|
||||
// Users of object stores, for example, may want to set this to true. Note:
|
||||
// this is only used if mapreduce.fileoutputcommitter.algorithm.version=2
|
||||
public static final String FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED =
|
||||
"mapreduce.fileoutputcommitter.task.cleanup.enabled";
|
||||
public static final boolean
|
||||
FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED_DEFAULT = false;
|
||||
|
||||
private Path outputPath = null;
|
||||
private Path workPath = null;
|
||||
private final int algorithmVersion;
|
||||
|
@ -586,6 +597,17 @@ public class FileOutputCommitter extends OutputCommitter {
|
|||
mergePaths(fs, taskAttemptDirStatus, outputPath);
|
||||
LOG.info("Saved output of task '" + attemptId + "' to " +
|
||||
outputPath);
|
||||
|
||||
if (context.getConfiguration().getBoolean(
|
||||
FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED,
|
||||
FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED_DEFAULT)) {
|
||||
LOG.debug(String.format(
|
||||
"Deleting the temporary directory of '%s': '%s'",
|
||||
attemptId, taskAttemptPath));
|
||||
if(!fs.delete(taskAttemptPath, true)) {
|
||||
LOG.warn("Could not delete " + taskAttemptPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG.warn("No Output found for " + attemptId);
|
||||
|
|
|
@ -1514,6 +1514,17 @@
|
|||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>mapreduce.fileoutputcommitter.task.cleanup.enabled</name>
|
||||
<value>false</value>
|
||||
<description>Whether tasks should delete their task temporary directories. This is purely an
|
||||
optimization for filesystems without O(1) recursive delete, as commitJob will recursively delete
|
||||
the entire job temporary directory. HDFS has O(1) recursive delete, so this parameter is left
|
||||
false by default. Users of object stores, for example, may want to set this to true.
|
||||
|
||||
Note: this is only used if mapreduce.fileoutputcommitter.algorithm.version=2</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms</name>
|
||||
<value>1000</value>
|
||||
|
|
|
@ -249,13 +249,18 @@ public class TestFileOutputCommitter extends TestCase {
|
|||
assert(dataFileFound && indexFileFound);
|
||||
}
|
||||
|
||||
private void testCommitterInternal(int version) throws Exception {
|
||||
private void testCommitterInternal(int version, boolean taskCleanup)
|
||||
throws Exception {
|
||||
Job job = Job.getInstance();
|
||||
FileOutputFormat.setOutputPath(job, outDir);
|
||||
Configuration conf = job.getConfiguration();
|
||||
conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
|
||||
conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
|
||||
conf.setInt(
|
||||
FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION,
|
||||
version);
|
||||
conf.setBoolean(
|
||||
FileOutputCommitter.FILEOUTPUTCOMMITTER_TASK_CLEANUP_ENABLED,
|
||||
taskCleanup);
|
||||
JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
|
||||
TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID);
|
||||
FileOutputCommitter committer = new FileOutputCommitter(outDir, tContext);
|
||||
|
@ -269,9 +274,30 @@ public class TestFileOutputCommitter extends TestCase {
|
|||
RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(tContext);
|
||||
writeOutput(theRecordWriter, tContext);
|
||||
|
||||
// check task and job temp directories exist
|
||||
File jobOutputDir = new File(
|
||||
new Path(outDir, FileOutputCommitter.PENDING_DIR_NAME).toString());
|
||||
File taskOutputDir = new File(Path.getPathWithoutSchemeAndAuthority(
|
||||
committer.getWorkPath()).toString());
|
||||
assertTrue("job temp dir does not exist", jobOutputDir.exists());
|
||||
assertTrue("task temp dir does not exist", taskOutputDir.exists());
|
||||
|
||||
// do commit
|
||||
committer.commitTask(tContext);
|
||||
assertTrue("job temp dir does not exist", jobOutputDir.exists());
|
||||
if (version == 1 || taskCleanup) {
|
||||
// Task temp dir gets renamed in v1 and deleted if taskCleanup is
|
||||
// enabled in v2
|
||||
assertFalse("task temp dir still exists", taskOutputDir.exists());
|
||||
} else {
|
||||
// By default, in v2 the task temp dir is only deleted during commitJob
|
||||
assertTrue("task temp dir does not exist", taskOutputDir.exists());
|
||||
}
|
||||
|
||||
// Entire job temp directory gets deleted, including task temp dir
|
||||
committer.commitJob(jContext);
|
||||
assertFalse("job temp dir still exists", jobOutputDir.exists());
|
||||
assertFalse("task temp dir still exists", taskOutputDir.exists());
|
||||
|
||||
// validate output
|
||||
validateContent(outDir);
|
||||
|
@ -279,13 +305,17 @@ public class TestFileOutputCommitter extends TestCase {
|
|||
}
|
||||
|
||||
public void testCommitterV1() throws Exception {
|
||||
testCommitterInternal(1);
|
||||
testCommitterInternal(1, false);
|
||||
}
|
||||
|
||||
public void testCommitterV2() throws Exception {
|
||||
testCommitterInternal(2);
|
||||
testCommitterInternal(2, false);
|
||||
}
|
||||
|
||||
|
||||
public void testCommitterV2TaskCleanupEnabled() throws Exception {
|
||||
testCommitterInternal(2, true);
|
||||
}
|
||||
|
||||
public void testCommitterWithDuplicatedCommitV1() throws Exception {
|
||||
testCommitterWithDuplicatedCommitInternal(1);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue