MAPREDUCE-4879. TeraOutputFormat may overwrite an existing output directory. (gera)

This commit is contained in:
Gera Shegalov 2014-12-13 17:48:42 -08:00
parent cbfb996fb4
commit 25a0440238
4 changed files with 59 additions and 17 deletions

View File

@@ -273,6 +273,9 @@ Release 2.7.0 - UNRELEASED
MAPREDUCE-6160. Potential NullPointerException in MRClientProtocol MAPREDUCE-6160. Potential NullPointerException in MRClientProtocol
interface implementation. (Rohith via jlowe) interface implementation. (Rohith via jlowe)
MAPREDUCE-4879. TeraOutputFormat may overwrite an existing output
directory. (gera)
Release 2.6.0 - 2014-11-18 Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@@ -289,10 +289,6 @@ public class TeraGen extends Configured implements Tool {
} }
setNumberOfRows(job, parseHumanLong(args[0])); setNumberOfRows(job, parseHumanLong(args[0]));
Path outputDir = new Path(args[1]); Path outputDir = new Path(args[1]);
if (outputDir.getFileSystem(getConf()).exists(outputDir)) {
throw new IOException("Output directory " + outputDir +
" already exists.");
}
FileOutputFormat.setOutputPath(job, outputDir); FileOutputFormat.setOutputPath(job, outputDir);
job.setJobName("TeraGen"); job.setJobName("TeraGen");
job.setJarByClass(TeraGen.class); job.setJarByClass(TeraGen.class);

View File

@@ -20,10 +20,13 @@ package org.apache.hadoop.examples.terasort;
import java.io.IOException; import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.InvalidJobConfException; import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter; import org.apache.hadoop.mapreduce.OutputCommitter;
@@ -87,9 +90,31 @@ public class TeraOutputFormat extends FileOutputFormat<Text,Text> {
throw new InvalidJobConfException("Output directory not set in JobConf."); throw new InvalidJobConfException("Output directory not set in JobConf.");
} }
final Configuration jobConf = job.getConfiguration();
// get delegation token for outDir's file system // get delegation token for outDir's file system
TokenCache.obtainTokensForNamenodes(job.getCredentials(), TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] { outDir }, job.getConfiguration()); new Path[] { outDir }, jobConf);
final FileSystem fs = outDir.getFileSystem(jobConf);
if (fs.exists(outDir)) {
// existing output dir is considered empty iff its only content is the
// partition file.
//
final FileStatus[] outDirKids = fs.listStatus(outDir);
boolean empty = false;
if (outDirKids != null && outDirKids.length == 1) {
final FileStatus st = outDirKids[0];
final String fname = st.getPath().getName();
empty =
!st.isDirectory() && TeraInputFormat.PARTITION_FILENAME.equals(fname);
}
if (TeraSort.getUseSimplePartitioner(job) || !empty) {
throw new FileAlreadyExistsException("Output directory " + outDir
+ " already exists");
}
}
} }
public RecordWriter<Text,Text> getRecordWriter(TaskAttemptContext job public RecordWriter<Text,Text> getRecordWriter(TaskAttemptContext job

View File

@@ -20,17 +20,19 @@ package org.apache.hadoop.examples.terasort;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.HadoopTestCase; import org.apache.hadoop.mapred.HadoopTestCase;
import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.util.ToolRunner;
import org.junit.Ignore;
@Ignore
public class TestTeraSort extends HadoopTestCase { public class TestTeraSort extends HadoopTestCase {
private static Log LOG = LogFactory.getLog(TestTeraSort.class);
public TestTeraSort() public TestTeraSort()
throws IOException { throws IOException {
super(CLUSTER_MR, DFS_FS, 1, 1); super(LOCAL_MR, LOCAL_FS, 1, 1);
} }
protected void tearDown() throws Exception { protected void tearDown() throws Exception {
@@ -78,9 +80,25 @@ public class TestTeraSort extends HadoopTestCase {
// Run TeraGen to generate input for 'terasort' // Run TeraGen to generate input for 'terasort'
runTeraGen(createJobConf(), SORT_INPUT_PATH); runTeraGen(createJobConf(), SORT_INPUT_PATH);
// Run teragen again to check for FAE
try {
runTeraGen(createJobConf(), SORT_INPUT_PATH);
fail("Teragen output overwritten!");
} catch (FileAlreadyExistsException fae) {
LOG.info("Expected exception: ", fae);
}
// Run terasort // Run terasort
runTeraSort(createJobConf(), SORT_INPUT_PATH, SORT_OUTPUT_PATH); runTeraSort(createJobConf(), SORT_INPUT_PATH, SORT_OUTPUT_PATH);
// Run terasort again to check for FAE
try {
runTeraSort(createJobConf(), SORT_INPUT_PATH, SORT_OUTPUT_PATH);
fail("Terasort output overwritten!");
} catch (FileAlreadyExistsException fae) {
LOG.info("Expected exception: ", fae);
}
// Run tera-validator to check if sort worked correctly // Run tera-validator to check if sort worked correctly
runTeraValidator(createJobConf(), SORT_OUTPUT_PATH, runTeraValidator(createJobConf(), SORT_OUTPUT_PATH,
TERA_OUTPUT_PATH); TERA_OUTPUT_PATH);