diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index 6ac67851092..9070c8c8c77 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -130,6 +130,8 @@ Release 0.23.1 - Unreleased MAPREDUCE-3710. Improved FileInputFormat to return better locality for the last split. (Siddarth Seth via vinodkv) + MAPREDUCE-2765. DistCp Rewrite. (Mithun Radhakrishnan via mahadev) + OPTIMIZATIONS MAPREDUCE-3567. Extraneous JobConf objects in AM heap. (Vinod Kumar diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index b90f99cba3e..5fdeed8c6a6 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -714,11 +714,21 @@ maven-project-info-reports-plugin 2.4 + + org.apache.maven.plugins + maven-resources-plugin + 2.2 + org.codehaus.mojo exec-maven-plugin 1.2 + + org.apache.maven.plugins + maven-pdf-plugin + 1.1 + @@ -816,6 +826,14 @@ + + org.apache.maven.plugins + maven-pdf-plugin + + ${project.reporting.outputDirectory} + false + + diff --git a/hadoop-tools/hadoop-distcp/README b/hadoop-tools/hadoop-distcp/README new file mode 100644 index 00000000000..14008df8af1 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/README @@ -0,0 +1,7 @@ +DistCp (distributed copy) is a tool used for large inter/intra-cluster copying. +It uses Map/Reduce to effect its distribution, error handling and recovery, +and reporting. It expands a list of files and directories into input to map tasks, +each of which will copy a partition of the files specified in the source list. + +Version 0.1 (2010/08/02 sriksun) + - Initial Version diff --git a/hadoop-tools/hadoop-distcp/pom.xml b/hadoop-tools/hadoop-distcp/pom.xml new file mode 100644 index 00000000000..7397ebeca89 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/pom.xml @@ -0,0 +1,185 @@ + + + 4.0.0 + + org.apache.hadoop + hadoop-project + 0.23.1-SNAPSHOT + ../../hadoop-project + + org.apache.hadoop.tools + hadoop-distcp + 0.23.1-SNAPSHOT + Apache Hadoop Distributed Copy + Apache Hadoop Distributed Copy + jar + + + UTF-8 + true + UTF-8 + + + + + org.apache.hadoop + hadoop-common + provided + + + org.apache.hadoop + hadoop-annotations + provided + + + org.apache.hadoop + hadoop-mapreduce-client-app + test + + + org.apache.hadoop + hadoop-mapreduce-client-hs + test + + + org.apache.hadoop + hadoop-mapreduce-client-core + provided + + + org.apache.hadoop + hadoop-mapreduce-client-jobclient + provided + + + org.apache.hadoop + hadoop-mapreduce-client-jobclient + test + test-jar + + + org.apache.hadoop + hadoop-hdfs + provided + + + org.apache.hadoop + hadoop-hdfs + test + test-jar + + + org.apache.hadoop + hadoop-common + test + test-jar + + + + + + + src/main/resources + true + + + + + src/test/resources + true + + + + + org.apache.maven.plugins + maven-surefire-plugin + + always + 600 + -Xmx1024m + + **/Test*.java + + true + + + test.build.data + ${basedir}/target/test/data + + + hadoop.log.dir + target/test/logs + + + org.apache.commons.logging.Log + org.apache.commons.logging.impl.SimpleLog + + + org.apache.commons.logging.simplelog.defaultlog + warn + + + + + + maven-dependency-plugin + + + package + + copy-dependencies + + + ${project.build.directory}/lib + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + + true + + + + org.apache.maven.plugins + maven-jar-plugin + + + + org.apache.hadoop.tools.DistCp + + + + + + org.apache.maven.plugins + maven-source-plugin + + true + + + + + jar + + + + + + org.apache.maven.plugins + maven-pdf-plugin + + + pdf + package + + pdf + + + + + 
+ + diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java new file mode 100644 index 00000000000..8965e0a54e8 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/CopyListing.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.security.Credentials; + +import java.io.IOException; + +/** + * The CopyListing abstraction is responsible for how the list of + * sources and targets is constructed, for DistCp's copy function. + * The copy-listing should be a SequenceFile, + * located at the path specified to buildListing(), + * each entry being a pair of (Source relative path, source file status), + * all the paths being fully qualified. + */ +public abstract class CopyListing extends Configured { + + private Credentials credentials; + + /** + * Build listing function creates the input listing that distcp uses to + * perform the copy. + * + * The build listing is a sequence file that has relative path of a file in the key + * and the file status information of the source file in the value + * + * For instance if the source path is /tmp/data and the traversed path is + * /tmp/data/dir1/dir2/file1, then the sequence file would contain + * + * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1) + * + * File would also contain directory entries. 
Meaning, if /tmp/data/dir1/dir2/file1 + * is the only file under /tmp/data, the resulting sequence file would contain the + * following entries + * + * key: /dir1 and value: FileStatus(/tmp/data/dir1) + * key: /dir1/dir2 and value: FileStatus(/tmp/data/dir1/dir2) + * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1) + * + * Cases requiring special handling: + * If source path is a file (/tmp/file1), contents of the file will be as follows + * + * TARGET DOES NOT EXIST: Key-"", Value-FileStatus(/tmp/file1) + * TARGET IS FILE : Key-"", Value-FileStatus(/tmp/file1) + * TARGET IS DIR : Key-"/file1", Value-FileStatus(/tmp/file1) + * + * @param pathToListFile - Output file where the listing would be stored + * @param options - Input options to distcp + * @throws IOException - Exception if any + */ + public final void buildListing(Path pathToListFile, + DistCpOptions options) throws IOException { + validatePaths(options); + doBuildListing(pathToListFile, options); + Configuration config = getConf(); + + config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, pathToListFile.toString()); + config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, getBytesToCopy()); + config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths()); + + checkForDuplicates(pathToListFile); + } + + /** + * Validate input and output paths + * + * @param options - Input options + * @throws InvalidInputException: If inputs are invalid + * @throws IOException: any Exception with FS + */ + protected abstract void validatePaths(DistCpOptions options) + throws IOException, InvalidInputException; + + /** + * The interface to be implemented by sub-classes, to create the source/target file listing. + * @param pathToListFile Path on HDFS where the listing file is written. + * @param options Input Options for DistCp (indicating source/target paths.) + * @throws IOException: Thrown on failure to create the listing file. 
+ */ + protected abstract void doBuildListing(Path pathToListFile, + DistCpOptions options) throws IOException; + + /** + * Return the total bytes that distCp should copy for the source paths + * This doesn't consider whether file is same should be skipped during copy + * + * @return total bytes to copy + */ + protected abstract long getBytesToCopy(); + + /** + * Return the total number of paths to distcp, includes directories as well + * This doesn't consider whether file/dir is already present and should be skipped during copy + * + * @return Total number of paths to distcp + */ + protected abstract long getNumberOfPaths(); + + /** + * Validate the final resulting path listing to see if there are any duplicate entries + * + * @param pathToListFile - path listing build by doBuildListing + * @throws IOException - Any issues while checking for duplicates and throws + * @throws DuplicateFileException - if there are duplicates + */ + private void checkForDuplicates(Path pathToListFile) + throws DuplicateFileException, IOException { + + Configuration config = getConf(); + FileSystem fs = pathToListFile.getFileSystem(config); + + Path sortedList = DistCpUtils.sortListing(fs, config, pathToListFile); + + SequenceFile.Reader reader = new SequenceFile.Reader( + config, SequenceFile.Reader.file(sortedList)); + try { + Text lastKey = new Text("*"); //source relative path can never hold * + FileStatus lastFileStatus = new FileStatus(); + + Text currentKey = new Text(); + while (reader.next(currentKey)) { + if (currentKey.equals(lastKey)) { + FileStatus currentFileStatus = new FileStatus(); + reader.getCurrentValue(currentFileStatus); + throw new DuplicateFileException("File " + lastFileStatus.getPath() + " and " + + currentFileStatus.getPath() + " would cause duplicates. Aborting"); + } + reader.getCurrentValue(lastFileStatus); + lastKey.set(currentKey); + } + } finally { + IOUtils.closeStream(reader); + } + } + + /** + * Protected constructor, to initialize configuration. + * @param configuration The input configuration, + * with which the source/target FileSystems may be accessed. + * @param credentials - Credentials object on which the FS delegation tokens are cached.If null + * delegation token caching is skipped + */ + protected CopyListing(Configuration configuration, Credentials credentials) { + setConf(configuration); + setCredentials(credentials); + } + + /** + * set Credentials store, on which FS delegatin token will be cached + * @param credentials - Credentials object + */ + protected void setCredentials(Credentials credentials) { + this.credentials = credentials; + } + + /** + * get credentials to update the delegation tokens for accessed FS objects + * @return Credentials object + */ + protected Credentials getCredentials() { + return credentials; + } + + /** + * Public Factory method with which the appropriate CopyListing implementation may be retrieved. + * @param configuration The input configuration. + * @param credentials Credentials object on which the FS delegation tokens are cached + * @param options The input Options, to help choose the appropriate CopyListing Implementation. + * @return An instance of the appropriate CopyListing implementation. 
+ */ + public static CopyListing getCopyListing(Configuration configuration, + Credentials credentials, + DistCpOptions options) { + if (options.getSourceFileListing() == null) { + return new GlobbedCopyListing(configuration, credentials); + } else { + return new FileBasedCopyListing(configuration, credentials); + } + } + + static class DuplicateFileException extends RuntimeException { + public DuplicateFileException(String message) { + super(message); + } + } + + static class InvalidInputException extends RuntimeException { + public InvalidInputException(String message) { + super(message); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java new file mode 100644 index 00000000000..6f1df5f4fe7 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCp.java @@ -0,0 +1,405 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.JobSubmissionFiles; +import org.apache.hadoop.mapreduce.Cluster; +import org.apache.hadoop.tools.CopyListing.*; +import org.apache.hadoop.tools.mapred.CopyMapper; +import org.apache.hadoop.tools.mapred.CopyOutputFormat; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import java.io.IOException; +import java.util.Random; + +/** + * DistCp is the main driver-class for DistCpV2. + * For command-line use, DistCp::main() orchestrates the parsing of command-line + * parameters and the launch of the DistCp job. + * For programmatic use, a DistCp object can be constructed by specifying + * options (in a DistCpOptions object), and DistCp::execute() may be used to + * launch the copy-job. DistCp may alternatively be sub-classed to fine-tune + * behaviour. 
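+ *
+ * For example (an illustrative sketch; the paths are placeholders):
+ * <pre>
+ *   DistCpOptions options = new DistCpOptions(
+ *       Arrays.asList(new Path("hdfs://nn1/source")),
+ *       new Path("hdfs://nn2/target"));
+ *   DistCp distCp = new DistCp(new Configuration(), options);
+ *   Job job = distCp.execute(); // Blocks until the copy completes, unless non-blocking mode is set.
+ * </pre>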
+ */ +public class DistCp extends Configured implements Tool { + private static final Log LOG = LogFactory.getLog(DistCp.class); + + private DistCpOptions inputOptions; + private Path metaFolder; + + private static final String PREFIX = "_distcp"; + private static final String WIP_PREFIX = "._WIP_"; + private static final String DISTCP_DEFAULT_XML = "distcp-default.xml"; + public static final Random rand = new Random(); + + private boolean submitted; + private FileSystem jobFS; + + /** + * Public Constructor. Creates DistCp object with specified input-parameters. + * (E.g. source-paths, target-location, etc.) + * @param inputOptions Options (indicating source-paths, target-location.) + * @param configuration The Hadoop configuration against which the Copy-mapper must run. + * @throws Exception, on failure. + */ + public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception { + Configuration config = new Configuration(configuration); + config.addResource(DISTCP_DEFAULT_XML); + setConf(config); + this.inputOptions = inputOptions; + this.metaFolder = createMetaFolderPath(); + } + + /** + * To be used with the ToolRunner. Not for public consumption. + */ + private DistCp() {} + + /** + * Implementation of Tool::run(). Orchestrates the copy of source file(s) + * to target location, by: + * 1. Creating a list of files to be copied to target. + * 2. Launching a Map-only job to copy the files. (Delegates to execute().) + * @param argv List of arguments passed to DistCp, from the ToolRunner. + * @return On success, it returns 0. Else, -1. + */ + public int run(String[] argv) { + try { + inputOptions = (OptionsParser.parse(argv)); + + LOG.info("Input Options: " + inputOptions); + } catch (Throwable e) { + LOG.error("Invalid arguments: ", e); + System.err.println("Invalid arguments: " + e.getMessage()); + OptionsParser.usage(); + return DistCpConstants.INVALID_ARGUMENT; + } + + try { + execute(); + } catch (InvalidInputException e) { + LOG.error("Invalid input: ", e); + return DistCpConstants.INVALID_ARGUMENT; + } catch (DuplicateFileException e) { + LOG.error("Duplicate files in input path: ", e); + return DistCpConstants.DUPLICATE_INPUT; + } catch (Exception e) { + LOG.error("Exception encountered ", e); + return DistCpConstants.UNKNOWN_ERROR; + } + return DistCpConstants.SUCCESS; + } + + /** + * Implements the core-execution. Creates the file-list for copy, + * and launches the Hadoop-job, to do the copy. + * @return Job handle + * @throws Exception, on failure. + */ + public Job execute() throws Exception { + assert inputOptions != null; + assert getConf() != null; + + Job job = null; + try { + metaFolder = createMetaFolderPath(); + jobFS = metaFolder.getFileSystem(getConf()); + + job = createJob(); + createInputFileListing(job); + + job.submit(); + submitted = true; + } finally { + if (!submitted) { + cleanup(); + } + } + + String jobID = job.getJobID().toString(); + job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID); + + LOG.info("DistCp job-id: " + jobID); + if (inputOptions.shouldBlock()) { + job.waitForCompletion(true); + } + return job; + } + + /** + * Create Job object for submitting it, with all the configuration + * + * @return Reference to job object. 
+ * @throws IOException - Exception if any + */ + private Job createJob() throws IOException { + String jobName = "distcp"; + String userChosenName = getConf().get(JobContext.JOB_NAME); + if (userChosenName != null) + jobName += ": " + userChosenName; + Job job = Job.getInstance(getConf()); + job.setJobName(jobName); + job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions)); + job.setJarByClass(CopyMapper.class); + configureOutputFormat(job); + + job.setMapperClass(CopyMapper.class); + job.setNumReduceTasks(0); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(Text.class); + job.setOutputFormatClass(CopyOutputFormat.class); + job.getConfiguration().set(JobContext.MAP_SPECULATIVE, "false"); + job.getConfiguration().set(JobContext.NUM_MAPS, + String.valueOf(inputOptions.getMaxMaps())); + + if (inputOptions.getSslConfigurationFile() != null) { + setupSSLConfig(job); + } + + inputOptions.appendToConf(job.getConfiguration()); + return job; + } + + /** + * Setup ssl configuration on the job configuration to enable hsftp access + * from map job. Also copy the ssl configuration file to Distributed cache + * + * @param job - Reference to job's handle + * @throws java.io.IOException - Exception if unable to locate ssl config file + */ + private void setupSSLConfig(Job job) throws IOException { + Configuration configuration = job.getConfiguration(); + Path sslConfigPath = new Path(configuration. + getResource(inputOptions.getSslConfigurationFile()).toString()); + + addSSLFilesToDistCache(job, sslConfigPath); + configuration.set(DistCpConstants.CONF_LABEL_SSL_CONF, sslConfigPath.getName()); + configuration.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfigPath.getName()); + } + + /** + * Add SSL files to distributed cache. 
Trust store, key store and ssl config xml + * + * @param job - Job handle + * @param sslConfigPath - ssl Configuration file specified through options + * @throws IOException - If any + */ + private void addSSLFilesToDistCache(Job job, + Path sslConfigPath) throws IOException { + Configuration configuration = job.getConfiguration(); + FileSystem localFS = FileSystem.getLocal(configuration); + + Configuration sslConf = new Configuration(false); + sslConf.addResource(sslConfigPath); + + Path localStorePath = getLocalStorePath(sslConf, + DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION); + job.addCacheFile(localStorePath.makeQualified(localFS.getUri(), + localFS.getWorkingDirectory()).toUri()); + configuration.set(DistCpConstants.CONF_LABEL_SSL_TRUST_STORE_LOCATION, + localStorePath.getName()); + + localStorePath = getLocalStorePath(sslConf, + DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION); + job.addCacheFile(localStorePath.makeQualified(localFS.getUri(), + localFS.getWorkingDirectory()).toUri()); + configuration.set(DistCpConstants.CONF_LABEL_SSL_KEY_STORE_LOCATION, + localStorePath.getName()); + + job.addCacheFile(sslConfigPath.makeQualified(localFS.getUri(), + localFS.getWorkingDirectory()).toUri()); + + } + + /** + * Get Local Trust store/key store path + * + * @param sslConf - Config from SSL Client xml + * @param storeKey - Key for either trust store or key store + * @return - Path where the store is present + * @throws IOException -If any + */ + private Path getLocalStorePath(Configuration sslConf, String storeKey) throws IOException { + if (sslConf.get(storeKey) != null) { + return new Path(sslConf.get(storeKey)); + } else { + throw new IOException("Store for " + storeKey + " is not set in " + + inputOptions.getSslConfigurationFile()); + } + } + + /** + * Setup output format appropriately + * + * @param job - Job handle + * @throws IOException - Exception if any + */ + private void configureOutputFormat(Job job) throws IOException { + final Configuration configuration = job.getConfiguration(); + Path targetPath = inputOptions.getTargetPath(); + FileSystem targetFS = targetPath.getFileSystem(configuration); + targetPath = targetPath.makeQualified(targetFS.getUri(), + targetFS.getWorkingDirectory()); + + if (inputOptions.shouldAtomicCommit()) { + Path workDir = inputOptions.getAtomicWorkPath(); + if (workDir == null) { + workDir = targetPath.getParent(); + } + workDir = new Path(workDir, WIP_PREFIX + targetPath.getName() + + rand.nextInt()); + FileSystem workFS = workDir.getFileSystem(configuration); + if (!DistCpUtils.compareFs(targetFS, workFS)) { + throw new IllegalArgumentException("Work path " + workDir + + " and target path " + targetPath + " are in different file system"); + } + CopyOutputFormat.setWorkingDirectory(job, workDir); + } else { + CopyOutputFormat.setWorkingDirectory(job, targetPath); + } + CopyOutputFormat.setCommitDirectory(job, targetPath); + + Path logPath = inputOptions.getLogPath(); + if (logPath == null) { + logPath = new Path(metaFolder, "_logs"); + } else { + LOG.info("DistCp job log path: " + logPath); + } + CopyOutputFormat.setOutputPath(job, logPath); + } + + /** + * Create input listing by invoking an appropriate copy listing + * implementation. 
Also add delegation tokens for each path + * to job's credential store + * + * @param job - Handle to job + * @return Returns the path where the copy listing is created + * @throws IOException - If any + */ + private Path createInputFileListing(Job job) throws IOException { + Path fileListingPath = getFileListingPath(); + CopyListing copyListing = CopyListing.getCopyListing(job.getConfiguration(), + job.getCredentials(), inputOptions); + copyListing.buildListing(fileListingPath, inputOptions); + return fileListingPath; + } + + /** + * Get default name of the copy listing file. Use the meta folder + * to create the copy listing file + * + * @return - Path where the copy listing file has to be saved + * @throws IOException - Exception if any + */ + private Path getFileListingPath() throws IOException { + String fileListPathStr = metaFolder + "/fileList.seq"; + Path path = new Path(fileListPathStr); + return new Path(path.toUri().normalize().toString()); + } + + /** + * Create a default working folder for the job, under the + * job staging directory + * + * @return Returns the working folder information + * @throws Exception - EXception if any + */ + private Path createMetaFolderPath() throws Exception { + Configuration configuration = getConf(); + Path stagingDir = JobSubmissionFiles.getStagingDir( + new Cluster(configuration), configuration); + Path metaFolderPath = new Path(stagingDir, PREFIX + String.valueOf(rand.nextInt())); + if (LOG.isDebugEnabled()) + LOG.debug("Meta folder location: " + metaFolderPath); + configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString()); + return metaFolderPath; + } + + /** + * Main function of the DistCp program. Parses the input arguments (via OptionsParser), + * and invokes the DistCp::run() method, via the ToolRunner. + * @param argv Command-line arguments sent to DistCp. 
+ */ + public static void main(String argv[]) { + try { + DistCp distCp = new DistCp(); + Cleanup CLEANUP = new Cleanup(distCp); + + Runtime.getRuntime().addShutdownHook(CLEANUP); + System.exit(ToolRunner.run(getDefaultConf(), distCp, argv)); + } + catch (Exception e) { + LOG.error("Couldn't complete DistCp operation: ", e); + System.exit(DistCpConstants.UNKNOWN_ERROR); + } + } + + /** + * Loads properties from distcp-default.xml into configuration + * object + * @return Configuration which includes properties from distcp-default.xml + */ + private static Configuration getDefaultConf() { + Configuration config = new Configuration(); + config.addResource(DISTCP_DEFAULT_XML); + return config; + } + + private synchronized void cleanup() { + try { + if (metaFolder == null) return; + + jobFS.delete(metaFolder, true); + metaFolder = null; + } catch (IOException e) { + LOG.error("Unable to cleanup meta folder: " + metaFolder, e); + } + } + + private boolean isSubmitted() { + return submitted; + } + + private static class Cleanup extends Thread { + private final DistCp distCp; + + public Cleanup(DistCp distCp) { + this.distCp = distCp; + } + + @Override + public void run() { + if (distCp.isSubmitted()) return; + + distCp.cleanup(); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java new file mode 100644 index 00000000000..705b7f925c4 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpConstants.java @@ -0,0 +1,104 @@ +package org.apache.hadoop.tools; + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Utility class to hold commonly used constants. + */ +public class DistCpConstants { + + /* Default number of maps to use for DistCp */ + public static final int DEFAULT_MAPS = 20; + + /* Default bandwidth if none specified */ + public static final int DEFAULT_BANDWIDTH_MB = 100; + + /* Default strategy for copying. 
Implementation looked up + from distcp-default.xml + */ + public static final String UNIFORMSIZE = "uniformsize"; + + /** + * Constants mapping to command line switches/input options + */ + public static final String CONF_LABEL_ATOMIC_COPY = "distcp.atomic.copy"; + public static final String CONF_LABEL_WORK_PATH = "distcp.work.path"; + public static final String CONF_LABEL_LOG_PATH = "distcp.log.path"; + public static final String CONF_LABEL_IGNORE_FAILURES = "distcp.ignore.failures"; + public static final String CONF_LABEL_PRESERVE_STATUS = "distcp.preserve.status"; + public static final String CONF_LABEL_SYNC_FOLDERS = "distcp.sync.folders"; + public static final String CONF_LABEL_DELETE_MISSING = "distcp.delete.missing.source"; + public static final String CONF_LABEL_SSL_CONF = "distcp.keystore.resource"; + public static final String CONF_LABEL_MAX_MAPS = "distcp.max.maps"; + public static final String CONF_LABEL_SOURCE_LISTING = "distcp.source.listing"; + public static final String CONF_LABEL_COPY_STRATEGY = "distcp.copy.strategy"; + public static final String CONF_LABEL_SKIP_CRC = "distcp.skip.crc"; + public static final String CONF_LABEL_OVERWRITE = "distcp.copy.overwrite"; + public static final String CONF_LABEL_BANDWIDTH_MB = "distcp.map.bandwidth.mb"; + + /* Total bytes to be copied. Updated by copylisting. Unfiltered count */ + public static final String CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED = "mapred.total.bytes.expected"; + + /* Total number of paths to copy, includes directories. Unfiltered count */ + public static final String CONF_LABEL_TOTAL_NUMBER_OF_RECORDS = "mapred.number.of.records"; + + /* SSL keystore resource */ + public static final String CONF_LABEL_SSL_KEYSTORE = "dfs.https.client.keystore.resource"; + + /* If input is based -f <>, file containing the src paths */ + public static final String CONF_LABEL_LISTING_FILE_PATH = "distcp.listing.file.path"; + + /* Directory where the mapreduce job will write to. If not atomic commit, then same + as CONF_LABEL_TARGET_FINAL_PATH + */ + public static final String CONF_LABEL_TARGET_WORK_PATH = "distcp.target.work.path"; + + /* Directory where the final data will be committed to. If not atomic commit, then same + as CONF_LABEL_TARGET_WORK_PATH + */ + public static final String CONF_LABEL_TARGET_FINAL_PATH = "distcp.target.final.path"; + + /** + * DistCp job id for consumers of the Disctp + */ + public static final String CONF_LABEL_DISTCP_JOB_ID = "distcp.job.id"; + + /* Meta folder where the job's intermediate data is kept */ + public static final String CONF_LABEL_META_FOLDER = "distcp.meta.folder"; + + /** + * Conf label for SSL Trust-store location. + */ + public static final String CONF_LABEL_SSL_TRUST_STORE_LOCATION + = "ssl.client.truststore.location"; + + /** + * Conf label for SSL Key-store location. 
+ */ + public static final String CONF_LABEL_SSL_KEY_STORE_LOCATION + = "ssl.client.keystore.location"; + + /** + * Constants for DistCp return code to shell / consumer of ToolRunner's run + */ + public static final int SUCCESS = 0; + public static final int INVALID_ARGUMENT = -1; + public static final int DUPLICATE_INPUT = -2; + public static final int UNKNOWN_ERROR = -999; +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java new file mode 100644 index 00000000000..6cecf0618f8 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptionSwitch.java @@ -0,0 +1,218 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.cli.Option; +import org.apache.hadoop.conf.Configuration; + +/** + * Enumeration mapping configuration keys to distcp command line + * options. + */ +public enum DistCpOptionSwitch { + + /** + * Ignores any failures during copy, and continues with rest. + * Logs failures in a file + */ + IGNORE_FAILURES(DistCpConstants.CONF_LABEL_IGNORE_FAILURES, + new Option("i", false, "Ignore failures during copy")), + + /** + * Preserves status of file/path in the target. + * Default behavior with -p, is to preserve replication, + * block size, user, group and permission on the target file + * + * If any of the optional switches are present among rbugp, then + * only the corresponding file attribute is preserved + * + */ + PRESERVE_STATUS(DistCpConstants.CONF_LABEL_PRESERVE_STATUS, + new Option("p", true, "preserve status (rbugp)" + + "(replication, block-size, user, group, permission)")), + + /** + * Update target location by copying only files that are missing + * in the target. This can be used to periodically sync two folders + * across source and target. Typically used with DELETE_MISSING + * Incompatible with ATOMIC_COMMIT + */ + SYNC_FOLDERS(DistCpConstants.CONF_LABEL_SYNC_FOLDERS, + new Option("update", false, "Update target, copying only missing" + + "files or directories")), + + /** + * Deletes missing files in target that are missing from source + * This allows the target to be in sync with the source contents + * Typically used in conjunction with SYNC_FOLDERS + * Incompatible with ATOMIC_COMMIT + */ + DELETE_MISSING(DistCpConstants.CONF_LABEL_DELETE_MISSING, + new Option("delete", false, "Delete from target, " + + "files missing in source")), + + /** + * Configuration file to use with hftps:// for securely copying + * files across clusters. 
Typically the configuration file contains + * truststore/keystore information such as location, password and type + */ + SSL_CONF(DistCpConstants.CONF_LABEL_SSL_CONF, + new Option("mapredSslConf", true, "Configuration for ssl config file" + + ", to use with hftps://")), + + /** + * Max number of maps to use during copy. DistCp will split work + * as equally as possible among these maps + */ + MAX_MAPS(DistCpConstants.CONF_LABEL_MAX_MAPS, + new Option("m", true, "Max number of concurrent maps to use for copy")), + + /** + * Source file listing can be provided to DistCp in a file. + * This allows DistCp to copy random list of files from source + * and copy them to target + */ + SOURCE_FILE_LISTING(DistCpConstants.CONF_LABEL_SOURCE_LISTING, + new Option("f", true, "List of files that need to be copied")), + + /** + * Copy all the source files and commit them atomically to the target + * This is typically useful in cases where there is a process + * polling for availability of a file/dir. This option is incompatible + * with SYNC_FOLDERS & DELETE_MISSING + */ + ATOMIC_COMMIT(DistCpConstants.CONF_LABEL_ATOMIC_COPY, + new Option("atomic", false, "Commit all changes or none")), + + /** + * Work path to be used only in conjunction in Atomic commit + */ + WORK_PATH(DistCpConstants.CONF_LABEL_WORK_PATH, + new Option("tmp", true, "Intermediate work path to be used for atomic commit")), + + /** + * Log path where distcp output logs are written to + */ + LOG_PATH(DistCpConstants.CONF_LABEL_LOG_PATH, + new Option("log", true, "Folder on DFS where distcp execution logs are saved")), + + /** + * Copy strategy is use. This could be dynamic or uniform size etc. + * DistCp would use an appropriate input format based on this. + */ + COPY_STRATEGY(DistCpConstants.CONF_LABEL_COPY_STRATEGY, + new Option("strategy", true, "Copy strategy to use. Default is " + + "dividing work based on file sizes")), + + /** + * Skip CRC checks between source and target, when determining what + * files need to be copied. + */ + SKIP_CRC(DistCpConstants.CONF_LABEL_SKIP_CRC, + new Option("skipcrccheck", false, "Whether to skip CRC checks between " + + "source and target paths.")), + + /** + * Overwrite target-files unconditionally. + */ + OVERWRITE(DistCpConstants.CONF_LABEL_OVERWRITE, + new Option("overwrite", false, "Choose to overwrite target files " + + "unconditionally, even if they exist.")), + + /** + * Should DisctpExecution be blocking + */ + BLOCKING("", + new Option("async", false, "Should distcp execution be blocking")), + + FILE_LIMIT("", + new Option("filelimit", true, "(Deprecated!) Limit number of files " + + "copied to <= n")), + + SIZE_LIMIT("", + new Option("sizelimit", true, "(Deprecated!) 
Limit number of files " + + "copied to <= n bytes")), + + /** + * Specify bandwidth per map in MB + */ + BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB, + new Option("bandwidth", true, "Specify bandwidth per map in MB")); + + private final String confLabel; + private final Option option; + + DistCpOptionSwitch(String confLabel, Option option) { + this.confLabel = confLabel; + this.option = option; + } + + /** + * Get Configuration label for the option + * @return configuration label name + */ + public String getConfigLabel() { + return confLabel; + } + + /** + * Get CLI Option corresponding to the distcp option + * @return option + */ + public Option getOption() { + return option; + } + + /** + * Get Switch symbol + * @return switch symbol char + */ + public String getSwitch() { + return option.getOpt(); + } + + @Override + public String toString() { + return super.name() + " {" + + "confLabel='" + confLabel + '\'' + + ", option=" + option + '}'; + } + + /** + * Helper function to add an option to hadoop configuration object + * @param conf - Configuration object to include the option + * @param option - Option to add + * @param value - Value + */ + public static void addToConf(Configuration conf, + DistCpOptionSwitch option, + String value) { + conf.set(option.getConfigLabel(), value); + } + + /** + * Helper function to set an option to hadoop configuration object + * @param conf - Configuration object to include the option + * @param option - Option to add + */ + public static void addToConf(Configuration conf, + DistCpOptionSwitch option) { + conf.set(option.getConfigLabel(), "true"); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java new file mode 100644 index 00000000000..87c3e8a1d57 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/DistCpOptions.java @@ -0,0 +1,525 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.tools.util.DistCpUtils; + +import java.util.EnumSet; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * The Options class encapsulates all DistCp options. + * These may be set from command-line (via the OptionsParser) + * or may be set manually. 
+ */ +public class DistCpOptions { + + private boolean atomicCommit = false; + private boolean syncFolder = false; + private boolean deleteMissing = false; + private boolean ignoreFailures = false; + private boolean overwrite = false; + private boolean skipCRC = false; + private boolean blocking = true; + + private int maxMaps = DistCpConstants.DEFAULT_MAPS; + private int mapBandwidth = DistCpConstants.DEFAULT_BANDWIDTH_MB; + + private String sslConfigurationFile; + + private String copyStrategy = DistCpConstants.UNIFORMSIZE; + + private EnumSet preserveStatus = EnumSet.noneOf(FileAttribute.class); + + private Path atomicWorkPath; + + private Path logPath; + + private Path sourceFileListing; + private List sourcePaths; + + private Path targetPath; + + public static enum FileAttribute{ + REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION; + + public static FileAttribute getAttribute(char symbol) { + for (FileAttribute attribute : values()) { + if (attribute.name().charAt(0) == Character.toUpperCase(symbol)) { + return attribute; + } + } + throw new NoSuchElementException("No attribute for " + symbol); + } + } + + /** + * Constructor, to initialize source/target paths. + * @param sourcePaths List of source-paths (including wildcards) + * to be copied to target. + * @param targetPath Destination path for the dist-copy. + */ + public DistCpOptions(List sourcePaths, Path targetPath) { + assert sourcePaths != null && !sourcePaths.isEmpty() : "Invalid source paths"; + assert targetPath != null : "Invalid Target path"; + + this.sourcePaths = sourcePaths; + this.targetPath = targetPath; + } + + /** + * Constructor, to initialize source/target paths. + * @param sourceFileListing File containing list of source paths + * @param targetPath Destination path for the dist-copy. + */ + public DistCpOptions(Path sourceFileListing, Path targetPath) { + assert sourceFileListing != null : "Invalid source paths"; + assert targetPath != null : "Invalid Target path"; + + this.sourceFileListing = sourceFileListing; + this.targetPath = targetPath; + } + + /** + * Copy constructor. + * @param that DistCpOptions being copied from. + */ + public DistCpOptions(DistCpOptions that) { + if (this != that && that != null) { + this.atomicCommit = that.atomicCommit; + this.syncFolder = that.syncFolder; + this.deleteMissing = that.deleteMissing; + this.ignoreFailures = that.ignoreFailures; + this.overwrite = that.overwrite; + this.skipCRC = that.skipCRC; + this.blocking = that.blocking; + this.maxMaps = that.maxMaps; + this.mapBandwidth = that.mapBandwidth; + this.sslConfigurationFile = that.getSslConfigurationFile(); + this.copyStrategy = that.copyStrategy; + this.preserveStatus = that.preserveStatus; + this.atomicWorkPath = that.getAtomicWorkPath(); + this.logPath = that.getLogPath(); + this.sourceFileListing = that.getSourceFileListing(); + this.sourcePaths = that.getSourcePaths(); + this.targetPath = that.getTargetPath(); + } + } + + /** + * Should the data be committed atomically? + * + * @return true if data should be committed automically. false otherwise + */ + public boolean shouldAtomicCommit() { + return atomicCommit; + } + + /** + * Set if data need to be committed automatically + * + * @param atomicCommit - boolean switch + */ + public void setAtomicCommit(boolean atomicCommit) { + validate(DistCpOptionSwitch.ATOMIC_COMMIT, atomicCommit); + this.atomicCommit = atomicCommit; + } + + /** + * Should the data be sync'ed between source and target paths? + * + * @return true if data should be sync'ed up. 
false otherwise + */ + public boolean shouldSyncFolder() { + return syncFolder; + } + + /** + * Set if source and target folder contents be sync'ed up + * + * @param syncFolder - boolean switch + */ + public void setSyncFolder(boolean syncFolder) { + validate(DistCpOptionSwitch.SYNC_FOLDERS, syncFolder); + this.syncFolder = syncFolder; + } + + /** + * Should target files missing in source should be deleted? + * + * @return true if zoombie target files to be removed. false otherwise + */ + public boolean shouldDeleteMissing() { + return deleteMissing; + } + + /** + * Set if files only present in target should be deleted + * + * @param deleteMissing - boolean switch + */ + public void setDeleteMissing(boolean deleteMissing) { + validate(DistCpOptionSwitch.DELETE_MISSING, deleteMissing); + this.deleteMissing = deleteMissing; + } + + /** + * Should failures be logged and ignored during copy? + * + * @return true if failures are to be logged and ignored. false otherwise + */ + public boolean shouldIgnoreFailures() { + return ignoreFailures; + } + + /** + * Set if failures during copy be ignored + * + * @param ignoreFailures - boolean switch + */ + public void setIgnoreFailures(boolean ignoreFailures) { + this.ignoreFailures = ignoreFailures; + } + + /** + * Should DistCp be running in blocking mode + * + * @return true if should run in blocking, false otherwise + */ + public boolean shouldBlock() { + return blocking; + } + + /** + * Set if Disctp should run blocking or non-blocking + * + * @param blocking - boolean switch + */ + public void setBlocking(boolean blocking) { + this.blocking = blocking; + } + + /** + * Should files be overwritten always? + * + * @return true if files in target that may exist before distcp, should always + * be overwritten. false otherwise + */ + public boolean shouldOverwrite() { + return overwrite; + } + + /** + * Set if files should always be overwritten on target + * + * @param overwrite - boolean switch + */ + public void setOverwrite(boolean overwrite) { + validate(DistCpOptionSwitch.OVERWRITE, overwrite); + this.overwrite = overwrite; + } + + /** + * Should CRC/checksum check be skipped while checking files are identical + * + * @return true if checksum check should be skipped while checking files are + * identical. 
false otherwise + */ + public boolean shouldSkipCRC() { + return skipCRC; + } + + /** + * Set if checksum comparison should be skipped while determining if + * source and destination files are identical + * + * @param skipCRC - boolean switch + */ + public void setSkipCRC(boolean skipCRC) { + validate(DistCpOptionSwitch.SKIP_CRC, skipCRC); + this.skipCRC = skipCRC; + } + + /** Get the max number of maps to use for this copy + * + * @return Max number of maps + */ + public int getMaxMaps() { + return maxMaps; + } + + /** + * Set the max number of maps to use for copy + * + * @param maxMaps - Number of maps + */ + public void setMaxMaps(int maxMaps) { + this.maxMaps = maxMaps; + } + + /** Get the map bandwidth in MB + * + * @return Bandwidth in MB + */ + public int getMapBandwidth() { + return mapBandwidth; + } + + /** + * Set per map bandwidth + * + * @param mapBandwidth - per map bandwidth + */ + public void setMapBandwidth(int mapBandwidth) { + assert mapBandwidth > 0 : "Bandwidth " + mapBandwidth + " is invalid (should be > 0)"; + this.mapBandwidth = mapBandwidth; + } + + /** + * Get path where the ssl configuration file is present to use for hftps:// + * + * @return Path on local file system + */ + public String getSslConfigurationFile() { + return sslConfigurationFile; + } + + /** + * Set the SSL configuration file path to use with hftps:// (local path) + * + * @param sslConfigurationFile - Local ssl config file path + */ + public void setSslConfigurationFile(String sslConfigurationFile) { + this.sslConfigurationFile = sslConfigurationFile; + } + + /** + * Returns an iterator with the list of file attributes to preserve + * + * @return iterator of file attributes to preserve + */ + public Iterator preserveAttributes() { + return preserveStatus.iterator(); + } + + /** + * Checks if the input attibute should be preserved or not + * + * @param attribute - Attribute to check + * @return True if attribute should be preserved, false otherwise + */ + public boolean shouldPreserve(FileAttribute attribute) { + return preserveStatus.contains(attribute); + } + + /** + * Add file attributes that need to be preserved. This method may be + * called multiple times to add attributes. + * + * @param fileAttribute - Attribute to add, one at a time + */ + public void preserve(FileAttribute fileAttribute) { + for (FileAttribute attribute : preserveStatus) { + if (attribute.equals(fileAttribute)) { + return; + } + } + preserveStatus.add(fileAttribute); + } + + /** Get work path for atomic commit. If null, the work + * path would be parentOf(targetPath) + "/._WIP_" + nameOf(targetPath) + * + * @return Atomic work path on the target cluster. Null if not set + */ + public Path getAtomicWorkPath() { + return atomicWorkPath; + } + + /** + * Set the work path for atomic commit + * + * @param atomicWorkPath - Path on the target cluster + */ + public void setAtomicWorkPath(Path atomicWorkPath) { + this.atomicWorkPath = atomicWorkPath; + } + + /** Get output directory for writing distcp logs. Otherwise logs + * are temporarily written to JobStagingDir/_logs and deleted + * upon job completion + * + * @return Log output path on the cluster where distcp job is run + */ + public Path getLogPath() { + return logPath; + } + + /** + * Set the log path where distcp output logs are stored + * Uses JobStagingDir/_logs by default + * + * @param logPath - Path where logs will be saved + */ + public void setLogPath(Path logPath) { + this.logPath = logPath; + } + + /** + * Get the copy strategy to use. 
Uses appropriate input format + * + * @return copy strategy to use + */ + public String getCopyStrategy() { + return copyStrategy; + } + + /** + * Set the copy strategy to use. Should map to a strategy implementation + * in distp-default.xml + * + * @param copyStrategy - copy Strategy to use + */ + public void setCopyStrategy(String copyStrategy) { + this.copyStrategy = copyStrategy; + } + + /** + * File path (hdfs:// or file://) that contains the list of actual + * files to copy + * + * @return - Source listing file path + */ + public Path getSourceFileListing() { + return sourceFileListing; + } + + /** + * Getter for sourcePaths. + * @return List of source-paths. + */ + public List getSourcePaths() { + return sourcePaths; + } + + /** + * Setter for sourcePaths. + * @param sourcePaths The new list of source-paths. + */ + public void setSourcePaths(List sourcePaths) { + assert sourcePaths != null && sourcePaths.size() != 0; + this.sourcePaths = sourcePaths; + } + + /** + * Getter for the targetPath. + * @return The target-path. + */ + public Path getTargetPath() { + return targetPath; + } + + public void validate(DistCpOptionSwitch option, boolean value) { + + boolean syncFolder = (option == DistCpOptionSwitch.SYNC_FOLDERS ? + value : this.syncFolder); + boolean overwrite = (option == DistCpOptionSwitch.OVERWRITE ? + value : this.overwrite); + boolean deleteMissing = (option == DistCpOptionSwitch.DELETE_MISSING ? + value : this.deleteMissing); + boolean atomicCommit = (option == DistCpOptionSwitch.ATOMIC_COMMIT ? + value : this.atomicCommit); + boolean skipCRC = (option == DistCpOptionSwitch.SKIP_CRC ? + value : this.skipCRC); + + if (syncFolder && atomicCommit) { + throw new IllegalArgumentException("Atomic commit can't be used with " + + "sync folder or overwrite options"); + } + + if (deleteMissing && !(overwrite || syncFolder)) { + throw new IllegalArgumentException("Delete missing is applicable " + + "only with update or overwrite options"); + } + + if (overwrite && syncFolder) { + throw new IllegalArgumentException("Overwrite and update options are " + + "mutually exclusive"); + } + + if (!syncFolder && skipCRC) { + throw new IllegalArgumentException("Skip CRC is valid only with update options"); + } + + } + + /** + * Add options to configuration. These will be used in the Mapper/committer + * + * @param conf - Configruation object to which the options need to be added + */ + public void appendToConf(Configuration conf) { + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT, + String.valueOf(atomicCommit)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.IGNORE_FAILURES, + String.valueOf(ignoreFailures)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SYNC_FOLDERS, + String.valueOf(syncFolder)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.DELETE_MISSING, + String.valueOf(deleteMissing)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.OVERWRITE, + String.valueOf(overwrite)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.SKIP_CRC, + String.valueOf(skipCRC)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.BANDWIDTH, + String.valueOf(mapBandwidth)); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.PRESERVE_STATUS, + DistCpUtils.packAttributes(preserveStatus)); + } + + /** + * Utility to easily string-ify Options, for logging. + * + * @return String representation of the Options. 
+ */ + @Override + public String toString() { + return "DistCpOptions{" + + "atomicCommit=" + atomicCommit + + ", syncFolder=" + syncFolder + + ", deleteMissing=" + deleteMissing + + ", ignoreFailures=" + ignoreFailures + + ", maxMaps=" + maxMaps + + ", sslConfigurationFile='" + sslConfigurationFile + '\'' + + ", copyStrategy='" + copyStrategy + '\'' + + ", sourceFileListing=" + sourceFileListing + + ", sourcePaths=" + sourcePaths + + ", targetPath=" + targetPath + + '}'; + } + + @Override + protected DistCpOptions clone() throws CloneNotSupportedException { + return (DistCpOptions) super.clone(); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java new file mode 100644 index 00000000000..0fe93c2f136 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/FileBasedCopyListing.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.security.Credentials; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +/** + * FileBasedCopyListing implements the CopyListing interface, + * to create the copy-listing for DistCp, + * by iterating over all source paths mentioned in a specified input-file. + */ +public class FileBasedCopyListing extends CopyListing { + + private final CopyListing globbedListing; + /** + * Constructor, to initialize base-class. + * @param configuration The input Configuration object. + * @param credentials - Credentials object on which the FS delegation tokens are cached. If null + * delegation token caching is skipped + */ + public FileBasedCopyListing(Configuration configuration, Credentials credentials) { + super(configuration, credentials); + globbedListing = new GlobbedCopyListing(getConf(), credentials); + } + + /** {@inheritDoc} */ + @Override + protected void validatePaths(DistCpOptions options) + throws IOException, InvalidInputException { + } + + /** + * Implementation of CopyListing::buildListing(). + * Iterates over all source paths mentioned in the input-file. + * @param pathToListFile Path on HDFS where the listing file is written. + * @param options Input Options for DistCp (indicating source/target paths.) 
+ * @throws IOException + */ + @Override + public void doBuildListing(Path pathToListFile, DistCpOptions options) throws IOException { + DistCpOptions newOption = new DistCpOptions(options); + newOption.setSourcePaths(fetchFileList(options.getSourceFileListing())); + globbedListing.buildListing(pathToListFile, newOption); + } + + private List fetchFileList(Path sourceListing) throws IOException { + List result = new ArrayList(); + FileSystem fs = sourceListing.getFileSystem(getConf()); + BufferedReader input = null; + try { + input = new BufferedReader(new InputStreamReader(fs.open(sourceListing))); + String line = input.readLine(); + while (line != null) { + result.add(new Path(line)); + line = input.readLine(); + } + } finally { + IOUtils.closeStream(input); + } + return result; + } + + /** {@inheritDoc} */ + @Override + protected long getBytesToCopy() { + return globbedListing.getBytesToCopy(); + } + + /** {@inheritDoc} */ + @Override + protected long getNumberOfPaths() { + return globbedListing.getNumberOfPaths(); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java new file mode 100644 index 00000000000..27330b78f72 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/GlobbedCopyListing.java @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.security.Credentials; + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +/** + * GlobbedCopyListing implements the CopyListing interface, to create the copy + * listing-file by "globbing" all specified source paths (wild-cards and all.) + */ +public class GlobbedCopyListing extends CopyListing { + private static final Log LOG = LogFactory.getLog(GlobbedCopyListing.class); + + private final CopyListing simpleListing; + /** + * Constructor, to initialize the configuration. + * @param configuration The input Configuration object. + * @param credentials Credentials object on which the FS delegation tokens are cached. 
If null + * delegation token caching is skipped + */ + public GlobbedCopyListing(Configuration configuration, Credentials credentials) { + super(configuration, credentials); + simpleListing = new SimpleCopyListing(getConf(), credentials) ; + } + + /** {@inheritDoc} */ + @Override + protected void validatePaths(DistCpOptions options) + throws IOException, InvalidInputException { + } + + /** + * Implementation of CopyListing::buildListing(). + * Creates the copy listing by "globbing" all source-paths. + * @param pathToListingFile The location at which the copy-listing file + * is to be created. + * @param options Input Options for DistCp (indicating source/target paths.) + * @throws IOException + */ + @Override + public void doBuildListing(Path pathToListingFile, + DistCpOptions options) throws IOException { + + List globbedPaths = new ArrayList(); + if (options.getSourcePaths().isEmpty()) { + throw new InvalidInputException("Nothing to process. Source paths::EMPTY"); + } + + for (Path p : options.getSourcePaths()) { + FileSystem fs = p.getFileSystem(getConf()); + FileStatus[] inputs = fs.globStatus(p); + + if(inputs != null && inputs.length > 0) { + for (FileStatus onePath: inputs) { + globbedPaths.add(onePath.getPath()); + } + } else { + throw new InvalidInputException(p + " doesn't exist"); + } + } + + DistCpOptions optionsGlobbed = new DistCpOptions(options); + optionsGlobbed.setSourcePaths(globbedPaths); + simpleListing.buildListing(pathToListingFile, optionsGlobbed); + } + + /** {@inheritDoc} */ + @Override + protected long getBytesToCopy() { + return simpleListing.getBytesToCopy(); + } + + /** {@inheritDoc} */ + @Override + protected long getNumberOfPaths() { + return simpleListing.getNumberOfPaths(); + } + +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java new file mode 100644 index 00000000000..fe78b70448a --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/OptionsParser.java @@ -0,0 +1,246 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.cli.*; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.tools.DistCpOptions.FileAttribute; + +import java.util.*; + +/** + * The OptionsParser parses out the command-line options passed to DistCp, + * and interprets those specific to DistCp, to create an Options object. 
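As a rough usage sketch (not part of the patch; the switch spellings come from DistCpOptionSwitch, and the cluster paths are made up), the parser turns a DistCp command line into a DistCpOptions instance:

    import org.apache.hadoop.tools.DistCpOptions;
    import org.apache.hadoop.tools.OptionsParser;

    public class OptionsParserSketch {
      public static void main(String[] args) {
        // Roughly what "hadoop distcp -update -delete hdfs://nn1/src hdfs://nn2/dest"
        // hands to OptionsParser, after GenericOptionsParser has consumed its own flags.
        DistCpOptions options = OptionsParser.parse(new String[] {
            "-update", "-delete", "hdfs://nn1:8020/src", "hdfs://nn2:8020/dest"});

        System.out.println(options.getSourcePaths()); // e.g. [hdfs://nn1:8020/src]
        System.out.println(options.getTargetPath());  // hdfs://nn2:8020/dest
      }
    }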
+ */ +public class OptionsParser { + + private static final Log LOG = LogFactory.getLog(OptionsParser.class); + + private static final Options cliOptions = new Options(); + + static { + for (DistCpOptionSwitch option : DistCpOptionSwitch.values()) { + if (LOG.isDebugEnabled()) { + LOG.debug("Adding option " + option.getOption()); + } + cliOptions.addOption(option.getOption()); + } + } + + private static class CustomParser extends GnuParser { + @Override + protected String[] flatten(Options options, String[] arguments, boolean stopAtNonOption) { + for (int index = 0; index < arguments.length; index++) { + if (arguments[index].equals("-" + DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) { + arguments[index] = "-prbugp"; + } + } + return super.flatten(options, arguments, stopAtNonOption); + } + } + + /** + * The parse method parses the command-line options, and creates + * a corresponding Options object. + * @param args Command-line arguments (excluding the options consumed + * by the GenericOptionsParser). + * @return The Options object, corresponding to the specified command-line. + * @throws IllegalArgumentException: Thrown if the parse fails. + */ + public static DistCpOptions parse(String args[]) throws IllegalArgumentException { + + CommandLineParser parser = new CustomParser(); + + CommandLine command; + try { + command = parser.parse(cliOptions, args, true); + } catch (ParseException e) { + throw new IllegalArgumentException("Unable to parse arguments. " + + Arrays.toString(args), e); + } + + DistCpOptions option; + Path targetPath; + List sourcePaths = new ArrayList(); + + String leftOverArgs[] = command.getArgs(); + if (leftOverArgs == null || leftOverArgs.length < 1) { + throw new IllegalArgumentException("Target path not specified"); + } + + //Last Argument is the target path + targetPath = new Path(leftOverArgs[leftOverArgs.length -1].trim()); + + //Copy any source paths in the arguments to the list + for (int index = 0; index < leftOverArgs.length - 1; index++) { + sourcePaths.add(new Path(leftOverArgs[index].trim())); + } + + /* If command has source file listing, use it else, fall back on source paths in args + If both are present, throw exception and bail */ + if (command.hasOption(DistCpOptionSwitch.SOURCE_FILE_LISTING.getSwitch())) { + if (!sourcePaths.isEmpty()) { + throw new IllegalArgumentException("Both source file listing and source paths present"); + } + option = new DistCpOptions(new Path(getVal(command, DistCpOptionSwitch. 
+ SOURCE_FILE_LISTING.getSwitch())), targetPath); + } else { + if (sourcePaths.isEmpty()) { + throw new IllegalArgumentException("Neither source file listing nor source paths present"); + } + option = new DistCpOptions(sourcePaths, targetPath); + } + + //Process all the other option switches and set options appropriately + if (command.hasOption(DistCpOptionSwitch.IGNORE_FAILURES.getSwitch())) { + option.setIgnoreFailures(true); + } + + if (command.hasOption(DistCpOptionSwitch.ATOMIC_COMMIT.getSwitch())) { + option.setAtomicCommit(true); + } + + if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch()) && + option.shouldAtomicCommit()) { + String workPath = getVal(command, DistCpOptionSwitch.WORK_PATH.getSwitch()); + if (workPath != null && !workPath.isEmpty()) { + option.setAtomicWorkPath(new Path(workPath)); + } + } else if (command.hasOption(DistCpOptionSwitch.WORK_PATH.getSwitch())) { + throw new IllegalArgumentException("-tmp work-path can only be specified along with -atomic"); + } + + if (command.hasOption(DistCpOptionSwitch.LOG_PATH.getSwitch())) { + option.setLogPath(new Path(getVal(command, DistCpOptionSwitch.LOG_PATH.getSwitch()))); + } + + if (command.hasOption(DistCpOptionSwitch.SYNC_FOLDERS.getSwitch())) { + option.setSyncFolder(true); + } + + if (command.hasOption(DistCpOptionSwitch.OVERWRITE.getSwitch())) { + option.setOverwrite(true); + } + + if (command.hasOption(DistCpOptionSwitch.DELETE_MISSING.getSwitch())) { + option.setDeleteMissing(true); + } + + if (command.hasOption(DistCpOptionSwitch.SKIP_CRC.getSwitch())) { + option.setSkipCRC(true); + } + + if (command.hasOption(DistCpOptionSwitch.BLOCKING.getSwitch())) { + option.setBlocking(false); + } + + if (command.hasOption(DistCpOptionSwitch.BANDWIDTH.getSwitch())) { + try { + Integer mapBandwidth = Integer.parseInt( + getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()).trim()); + option.setMapBandwidth(mapBandwidth); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Bandwidth specified is invalid: " + + getVal(command, DistCpOptionSwitch.BANDWIDTH.getSwitch()), e); + } + } + + if (command.hasOption(DistCpOptionSwitch.SSL_CONF.getSwitch())) { + option.setSslConfigurationFile(command. + getOptionValue(DistCpOptionSwitch.SSL_CONF.getSwitch())); + } + + if (command.hasOption(DistCpOptionSwitch.MAX_MAPS.getSwitch())) { + try { + Integer maps = Integer.parseInt( + getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()).trim()); + option.setMaxMaps(maps); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Number of maps is invalid: " + + getVal(command, DistCpOptionSwitch.MAX_MAPS.getSwitch()), e); + } + } + + if (command.hasOption(DistCpOptionSwitch.COPY_STRATEGY.getSwitch())) { + option.setCopyStrategy( + getVal(command, DistCpOptionSwitch.COPY_STRATEGY.getSwitch())); + } + + if (command.hasOption(DistCpOptionSwitch.PRESERVE_STATUS.getSwitch())) { + String attributes = + getVal(command, DistCpOptionSwitch.PRESERVE_STATUS.getSwitch()); + if (attributes == null || attributes.isEmpty()) { + for (FileAttribute attribute : FileAttribute.values()) { + option.preserve(attribute); + } + } else { + for (int index = 0; index < attributes.length(); index++) { + option.preserve(FileAttribute. 
+ getAttribute(attributes.charAt(index))); + } + } + } + + if (command.hasOption(DistCpOptionSwitch.FILE_LIMIT.getSwitch())) { + String fileLimitString = getVal(command, + DistCpOptionSwitch.FILE_LIMIT.getSwitch().trim()); + try { + Integer.parseInt(fileLimitString); + } + catch (NumberFormatException e) { + throw new IllegalArgumentException("File-limit is invalid: " + + fileLimitString, e); + } + LOG.warn(DistCpOptionSwitch.FILE_LIMIT.getSwitch() + " is a deprecated" + + " option. Ignoring."); + } + + if (command.hasOption(DistCpOptionSwitch.SIZE_LIMIT.getSwitch())) { + String sizeLimitString = getVal(command, + DistCpOptionSwitch.SIZE_LIMIT.getSwitch().trim()); + try { + Long.parseLong(sizeLimitString); + } + catch (NumberFormatException e) { + throw new IllegalArgumentException("Size-limit is invalid: " + + sizeLimitString, e); + } + LOG.warn(DistCpOptionSwitch.SIZE_LIMIT.getSwitch() + " is a deprecated" + + " option. Ignoring."); + } + + return option; + } + + private static String getVal(CommandLine command, String swtch) { + String optionValue = command.getOptionValue(swtch); + if (optionValue == null) { + return null; + } else { + return optionValue.trim(); + } + } + + public static void usage() { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("distcp OPTIONS [source_path...] \n\nOPTIONS", cliOptions); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java new file mode 100644 index 00000000000..53c55b78421 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/SimpleCopyListing.java @@ -0,0 +1,275 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.apache.hadoop.security.Credentials; + +import java.io.*; +import java.util.Stack; + +/** + * The SimpleCopyListing is responsible for making the exhaustive list of + * all files/directories under its specified list of input-paths. + * These are written into the specified copy-listing file. + * Note: The SimpleCopyListing doesn't handle wild-cards in the input-paths. 
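The listing written here is a SequenceFile of (Text relative-path, FileStatus) pairs, as the writeToFileListing() method below shows. A small sketch of reading one back for inspection (the listing path is illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class ListingDumpSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path listing = new Path("/tmp/distcp-meta/fileList.seq"); // illustrative path
        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
            SequenceFile.Reader.file(listing));
        try {
          Text relPath = new Text();
          FileStatus status = new FileStatus();
          while (reader.next(relPath, status)) {
            System.out.println(relPath + " -> " + status.getPath()
                + " (" + status.getLen() + " bytes)");
          }
        } finally {
          IOUtils.closeStream(reader);
        }
      }
    }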
+ */
+public class SimpleCopyListing extends CopyListing {
+  private static final Log LOG = LogFactory.getLog(SimpleCopyListing.class);
+
+  private long totalPaths = 0;
+  private long totalBytesToCopy = 0;
+
+  /**
+   * Protected constructor, to initialize configuration.
+   *
+   * @param configuration The input configuration, with which the source/target FileSystems may be accessed.
+   * @param credentials - Credentials object on which the FS delegation tokens are cached. If null,
+   * delegation token caching is skipped.
+   */
+  protected SimpleCopyListing(Configuration configuration, Credentials credentials) {
+    super(configuration, credentials);
+  }
+
+  @Override
+  protected void validatePaths(DistCpOptions options)
+      throws IOException, InvalidInputException {
+
+    Path targetPath = options.getTargetPath();
+    FileSystem targetFS = targetPath.getFileSystem(getConf());
+    boolean targetIsFile = targetFS.isFile(targetPath);
+
+    // If the target is a file, then the source has to be a single file.
+    if (targetIsFile) {
+      if (options.getSourcePaths().size() > 1) {
+        throw new InvalidInputException("Multiple sources being copied to a file: " +
+            targetPath);
+      }
+
+      Path srcPath = options.getSourcePaths().get(0);
+      FileSystem sourceFS = srcPath.getFileSystem(getConf());
+      if (!sourceFS.isFile(srcPath)) {
+        throw new InvalidInputException("Cannot copy " + srcPath +
+            ", which is not a file, to " + targetPath);
+      }
+    }
+
+    if (options.shouldAtomicCommit() && targetFS.exists(targetPath)) {
+      throw new InvalidInputException("Target path for atomic-commit already exists: " +
+        targetPath + ". Cannot atomic-commit to pre-existing target-path.");
+    }
+
+    for (Path path: options.getSourcePaths()) {
+      FileSystem fs = path.getFileSystem(getConf());
+      if (!fs.exists(path)) {
+        throw new InvalidInputException(path + " doesn't exist");
+      }
+    }
+
+    /* This is required to allow map tasks to access each of the source
+       clusters.
This would retrieve the delegation token for each unique + file system and add them to job's private credential store + */ + Credentials credentials = getCredentials(); + if (credentials != null) { + Path[] inputPaths = options.getSourcePaths().toArray(new Path[1]); + TokenCache.obtainTokensForNamenodes(credentials, inputPaths, getConf()); + } + } + + /** {@inheritDoc} */ + @Override + public void doBuildListing(Path pathToListingFile, DistCpOptions options) throws IOException { + + SequenceFile.Writer fileListWriter = null; + + try { + fileListWriter = getWriter(pathToListingFile); + + for (Path path: options.getSourcePaths()) { + FileSystem sourceFS = path.getFileSystem(getConf()); + path = makeQualified(path); + + FileStatus rootStatus = sourceFS.getFileStatus(path); + Path sourcePathRoot = computeSourceRootPath(rootStatus, options); + boolean localFile = (rootStatus.getClass() != FileStatus.class); + + FileStatus[] sourceFiles = sourceFS.listStatus(path); + if (sourceFiles != null && sourceFiles.length > 0) { + for (FileStatus sourceStatus: sourceFiles) { + if (LOG.isDebugEnabled()) { + LOG.debug("Recording source-path: " + sourceStatus.getPath() + " for copy."); + } + writeToFileListing(fileListWriter, sourceStatus, sourcePathRoot, localFile); + + if (isDirectoryAndNotEmpty(sourceFS, sourceStatus)) { + if (LOG.isDebugEnabled()) { + LOG.debug("Traversing non-empty source dir: " + sourceStatus.getPath()); + } + traverseNonEmptyDirectory(fileListWriter, sourceStatus, sourcePathRoot, localFile); + } + } + } else { + writeToFileListing(fileListWriter, rootStatus, sourcePathRoot, localFile); + } + } + } finally { + IOUtils.closeStream(fileListWriter); + } + } + + private Path computeSourceRootPath(FileStatus sourceStatus, + DistCpOptions options) throws IOException { + + Path target = options.getTargetPath(); + FileSystem targetFS = target.getFileSystem(getConf()); + + boolean solitaryFile = options.getSourcePaths().size() == 1 + && !sourceStatus.isDirectory(); + + if (solitaryFile) { + if (targetFS.isFile(target) || !targetFS.exists(target)) { + return sourceStatus.getPath(); + } else { + return sourceStatus.getPath().getParent(); + } + } else { + boolean specialHandling = (options.getSourcePaths().size() == 1 && !targetFS.exists(target)) || + options.shouldSyncFolder() || options.shouldOverwrite(); + + return specialHandling && sourceStatus.isDirectory() ? 
sourceStatus.getPath() : + sourceStatus.getPath().getParent(); + } + } + + /** {@inheritDoc} */ + @Override + protected long getBytesToCopy() { + return totalBytesToCopy; + } + + /** {@inheritDoc} */ + @Override + protected long getNumberOfPaths() { + return totalPaths; + } + + private Path makeQualified(Path path) throws IOException { + final FileSystem fs = path.getFileSystem(getConf()); + return path.makeQualified(fs.getUri(), fs.getWorkingDirectory()); + } + + private SequenceFile.Writer getWriter(Path pathToListFile) throws IOException { + FileSystem fs = pathToListFile.getFileSystem(getConf()); + if (fs.exists(pathToListFile)) { + fs.delete(pathToListFile, false); + } + return SequenceFile.createWriter(getConf(), + SequenceFile.Writer.file(pathToListFile), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(FileStatus.class), + SequenceFile.Writer.compression(SequenceFile.CompressionType.NONE)); + } + + private static boolean isDirectoryAndNotEmpty(FileSystem fileSystem, + FileStatus fileStatus) throws IOException { + return fileStatus.isDirectory() && getChildren(fileSystem, fileStatus).length > 0; + } + + private static FileStatus[] getChildren(FileSystem fileSystem, + FileStatus parent) throws IOException { + return fileSystem.listStatus(parent.getPath()); + } + + private void traverseNonEmptyDirectory(SequenceFile.Writer fileListWriter, + FileStatus sourceStatus, + Path sourcePathRoot, boolean localFile) + throws IOException { + FileSystem sourceFS = sourcePathRoot.getFileSystem(getConf()); + Stack pathStack = new Stack(); + pathStack.push(sourceStatus); + + while (!pathStack.isEmpty()) { + for (FileStatus child: getChildren(sourceFS, pathStack.pop())) { + if (LOG.isDebugEnabled()) + LOG.debug("Recording source-path: " + + sourceStatus.getPath() + " for copy."); + writeToFileListing(fileListWriter, child, sourcePathRoot, localFile); + if (isDirectoryAndNotEmpty(sourceFS, child)) { + if (LOG.isDebugEnabled()) + LOG.debug("Traversing non-empty source dir: " + + sourceStatus.getPath()); + pathStack.push(child); + } + } + } + } + + private void writeToFileListing(SequenceFile.Writer fileListWriter, + FileStatus fileStatus, Path sourcePathRoot, + boolean localFile) throws IOException { + if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDirectory()) + return; // Skip the root-paths. 
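+    // Note that only the root itself is skipped; everything under it, including
+    // sub-directories, is still recorded by the code below.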
+ + if (LOG.isDebugEnabled()) { + LOG.debug("REL PATH: " + DistCpUtils.getRelativePath(sourcePathRoot, + fileStatus.getPath()) + ", FULL PATH: " + fileStatus.getPath()); + } + + FileStatus status = fileStatus; + if (localFile) { + status = getFileStatus(fileStatus); + } + + fileListWriter.append(new Text(DistCpUtils.getRelativePath(sourcePathRoot, + fileStatus.getPath())), status); + fileListWriter.sync(); + + if (!fileStatus.isDirectory()) { + totalBytesToCopy += fileStatus.getLen(); + } + totalPaths++; + } + + private static final ByteArrayOutputStream buffer = new ByteArrayOutputStream(64); + private DataInputBuffer in = new DataInputBuffer(); + + private FileStatus getFileStatus(FileStatus fileStatus) throws IOException { + FileStatus status = new FileStatus(); + + buffer.reset(); + DataOutputStream out = new DataOutputStream(buffer); + fileStatus.write(out); + + in.reset(buffer.toByteArray(), 0, buffer.size()); + status.readFields(in); + return status; + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java new file mode 100644 index 00000000000..25f958625f5 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyCommitter.java @@ -0,0 +1,297 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; +import org.apache.hadoop.tools.*; +import org.apache.hadoop.tools.DistCpOptions.FileAttribute; +import org.apache.hadoop.tools.util.DistCpUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +/** + * The CopyCommitter class is DistCp's OutputCommitter implementation. It is + * responsible for handling the completion/cleanup of the DistCp run. + * Specifically, it does the following: + * 1. Cleanup of the meta-folder (where DistCp maintains its file-list, etc.) + * 2. Preservation of user/group/replication-factor on any directories that + * have been copied. (Files are taken care of in their map-tasks.) + * 3. Atomic-move of data from the temporary work-folder to the final path + * (if atomic-commit was opted for). + * 4. 
Deletion of files from the target that are missing at source (if opted for). + * 5. Cleanup of any partially copied files, from previous, failed attempts. + */ +public class CopyCommitter extends FileOutputCommitter { + private static final Log LOG = LogFactory.getLog(CopyCommitter.class); + + private final TaskAttemptContext taskAttemptContext; + + /** + * Create a output committer + * + * @param outputPath the job's output path + * @param context the task's context + * @throws IOException - Exception if any + */ + public CopyCommitter(Path outputPath, TaskAttemptContext context) throws IOException { + super(outputPath, context); + this.taskAttemptContext = context; + } + + /** @inheritDoc */ + @Override + public void commitJob(JobContext jobContext) throws IOException { + Configuration conf = jobContext.getConfiguration(); + super.commitJob(jobContext); + + cleanupTempFiles(jobContext); + + String attributes = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS); + if (attributes != null && !attributes.isEmpty()) { + preserveFileAttributesForDirectories(conf); + } + + try { + if (conf.getBoolean(DistCpConstants.CONF_LABEL_DELETE_MISSING, false)) { + deleteMissing(conf); + } else if (conf.getBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, false)) { + commitData(conf); + } + taskAttemptContext.setStatus("Commit Successful"); + } + finally { + cleanup(conf); + } + } + + /** @inheritDoc */ + @Override + public void abortJob(JobContext jobContext, + JobStatus.State state) throws IOException { + try { + super.abortJob(jobContext, state); + } finally { + cleanupTempFiles(jobContext); + cleanup(jobContext.getConfiguration()); + } + } + + private void cleanupTempFiles(JobContext context) { + try { + Configuration conf = context.getConfiguration(); + + Path targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH)); + FileSystem targetFS = targetWorkPath.getFileSystem(conf); + + String jobId = context.getJobID().toString(); + deleteAttemptTempFiles(targetWorkPath, targetFS, jobId); + deleteAttemptTempFiles(targetWorkPath.getParent(), targetFS, jobId); + } catch (Throwable t) { + LOG.warn("Unable to cleanup temp files", t); + } + } + + private void deleteAttemptTempFiles(Path targetWorkPath, + FileSystem targetFS, + String jobId) throws IOException { + + FileStatus[] tempFiles = targetFS.globStatus( + new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*")); + + if (tempFiles != null && tempFiles.length > 0) { + for (FileStatus file : tempFiles) { + LOG.info("Cleaning up " + file.getPath()); + targetFS.delete(file.getPath(), false); + } + } + } + + /** + * Cleanup meta folder and other temporary files + * + * @param conf - Job Configuration + */ + private void cleanup(Configuration conf) { + Path metaFolder = new Path(conf.get(DistCpConstants.CONF_LABEL_META_FOLDER)); + try { + FileSystem fs = metaFolder.getFileSystem(conf); + LOG.info("Cleaning up temporary work folder: " + metaFolder); + fs.delete(metaFolder, true); + } catch (IOException ignore) { + LOG.error("Exception encountered ", ignore); + } + } + + // This method changes the target-directories' file-attributes (owner, + // user/group permissions, etc.) based on the corresponding source directories. 
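+  // Only directories need this pass; file attributes are preserved by the map
+  // tasks at the time each file is copied (see CopyMapper).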
+ private void preserveFileAttributesForDirectories(Configuration conf) throws IOException { + String attrSymbols = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS); + LOG.info("About to preserve attributes: " + attrSymbols); + + EnumSet attributes = DistCpUtils.unpackAttributes(attrSymbols); + + Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH)); + FileSystem clusterFS = sourceListing.getFileSystem(conf); + SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, + SequenceFile.Reader.file(sourceListing)); + long totalLen = clusterFS.getFileStatus(sourceListing).getLen(); + + Path targetRoot = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH)); + + long preservedEntries = 0; + try { + FileStatus srcFileStatus = new FileStatus(); + Text srcRelPath = new Text(); + + // Iterate over every source path that was copied. + while (sourceReader.next(srcRelPath, srcFileStatus)) { + // File-attributes for files are set at the time of copy, + // in the map-task. + if (! srcFileStatus.isDirectory()) continue; + + Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath); + + // Skip the root folder. + // Status can't be preserved on root-folder. (E.g. multiple paths may + // be copied to a single target folder. Which source-attributes to use + // on the target is undefined.) + if (targetRoot.equals(targetFile)) continue; + + FileSystem targetFS = targetFile.getFileSystem(conf); + DistCpUtils.preserve(targetFS, targetFile, srcFileStatus, attributes); + + taskAttemptContext.progress(); + taskAttemptContext.setStatus("Preserving status on directory entries. [" + + sourceReader.getPosition() * 100 / totalLen + "%]"); + } + } finally { + IOUtils.closeStream(sourceReader); + } + LOG.info("Preserved status on " + preservedEntries + " dir entries on target"); + } + + // This method deletes "extra" files from the target, if they're not + // available at the source. + private void deleteMissing(Configuration conf) throws IOException { + LOG.info("-delete option is enabled. About to remove entries from " + + "target that are missing in source"); + + // Sort the source-file listing alphabetically. + Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH)); + FileSystem clusterFS = sourceListing.getFileSystem(conf); + Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing); + + // Similarly, create the listing of target-files. Sort alphabetically. + Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq"); + CopyListing target = new GlobbedCopyListing(new Configuration(conf), null); + + List targets = new ArrayList(1); + Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH)); + targets.add(targetFinalPath); + DistCpOptions options = new DistCpOptions(targets, new Path("/NONE")); + + target.buildListing(targetListing, options); + Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing); + long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen(); + + SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, + SequenceFile.Reader.file(sortedSourceListing)); + SequenceFile.Reader targetReader = new SequenceFile.Reader(conf, + SequenceFile.Reader.file(sortedTargetListing)); + + // Walk both source and target file listings. + // Delete all from target that doesn't also exist on source. 
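+    // Both listings are sorted by relative path, so a single merge-style pass
+    // suffices: advance the source reader while its key sorts before the current
+    // target key; any target entry without a matching source entry is deleted.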
+ long deletedEntries = 0; + try { + FileStatus srcFileStatus = new FileStatus(); + Text srcRelPath = new Text(); + FileStatus trgtFileStatus = new FileStatus(); + Text trgtRelPath = new Text(); + + FileSystem targetFS = targetFinalPath.getFileSystem(conf); + boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus); + while (targetReader.next(trgtRelPath, trgtFileStatus)) { + // Skip sources that don't exist on target. + while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) { + srcAvailable = sourceReader.next(srcRelPath, srcFileStatus); + } + + if (srcAvailable && trgtRelPath.equals(srcRelPath)) continue; + + // Target doesn't exist at source. Delete. + boolean result = (!targetFS.exists(trgtFileStatus.getPath()) || + targetFS.delete(trgtFileStatus.getPath(), true)); + if (result) { + LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source"); + deletedEntries++; + } else { + throw new IOException("Unable to delete " + trgtFileStatus.getPath()); + } + taskAttemptContext.progress(); + taskAttemptContext.setStatus("Deleting missing files from target. [" + + targetReader.getPosition() * 100 / totalLen + "%]"); + } + } finally { + IOUtils.closeStream(sourceReader); + IOUtils.closeStream(targetReader); + } + LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0)); + } + + private void commitData(Configuration conf) throws IOException { + + Path workDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH)); + Path finalDir = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH)); + FileSystem targetFS = workDir.getFileSystem(conf); + + LOG.info("Atomic commit enabled. Moving " + workDir + " to " + finalDir); + if (targetFS.exists(finalDir) && targetFS.exists(workDir)) { + LOG.error("Pre-existing final-path found at: " + finalDir); + throw new IOException("Target-path can't be committed to because it " + + "exists at " + finalDir + ". Copied data is in temp-dir: " + workDir + ". "); + } + + boolean result = targetFS.rename(workDir, finalDir); + if (!result) { + LOG.warn("Rename failed. Perhaps data already moved. Verifying..."); + result = targetFS.exists(finalDir) && !targetFS.exists(workDir); + } + if (result) { + LOG.info("Data committed successfully to " + finalDir); + taskAttemptContext.setStatus("Data committed successfully to " + finalDir); + } else { + LOG.error("Unable to commit data to " + finalDir); + throw new IOException("Atomic commit failed. Temporary data in " + workDir + + ", Unable to move to " + finalDir); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java new file mode 100644 index 00000000000..5562dd86b13 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyMapper.java @@ -0,0 +1,330 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.DistCpOptionSwitch; +import org.apache.hadoop.tools.DistCpOptions; +import org.apache.hadoop.tools.DistCpOptions.FileAttribute; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.util.StringUtils; + +import java.io.*; +import java.util.EnumSet; +import java.util.Arrays; + +/** + * Mapper class that executes the DistCp copy operation. + * Implements the o.a.h.mapreduce.Mapper<> interface. + */ +public class CopyMapper extends Mapper { + + /** + * Hadoop counters for the DistCp CopyMapper. + * (These have been kept identical to the old DistCp, + * for backward compatibility.) + */ + public static enum Counter { + COPY, // Number of files received by the mapper for copy. + SKIP, // Number of files skipped. + FAIL, // Number of files that failed to be copied. + BYTESCOPIED, // Number of bytes actually copied by the copy-mapper, total. + BYTESEXPECTED,// Number of bytes expected to be copied. + BYTESFAILED, // Number of bytes that failed to be copied. + BYTESSKIPPED, // Number of bytes that were skipped from copy. + } + + private static Log LOG = LogFactory.getLog(CopyMapper.class); + + private Configuration conf; + + private boolean syncFolders = false; + private boolean ignoreFailures = false; + private boolean skipCrc = false; + private boolean overWrite = false; + private EnumSet preserve = EnumSet.noneOf(FileAttribute.class); + + private FileSystem targetFS = null; + private Path targetWorkPath = null; + + /** + * Implementation of the Mapper::setup() method. This extracts the DistCp- + * options specified in the Job's configuration, to set up the Job. + * @param context Mapper's context. + * @throws IOException On IO failure. + * @throws InterruptedException If the job is interrupted. + */ + @Override + public void setup(Context context) throws IOException, InterruptedException { + conf = context.getConfiguration(); + + syncFolders = conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false); + ignoreFailures = conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false); + skipCrc = conf.getBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), false); + overWrite = conf.getBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), false); + preserve = DistCpUtils.unpackAttributes(conf.get(DistCpOptionSwitch. 
+ PRESERVE_STATUS.getConfigLabel())); + + targetWorkPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH)); + Path targetFinalPath = new Path(conf.get( + DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH)); + targetFS = targetFinalPath.getFileSystem(conf); + + if (targetFS.exists(targetFinalPath) && targetFS.isFile(targetFinalPath)) { + overWrite = true; // When target is an existing file, overwrite it. + } + + if (conf.get(DistCpConstants.CONF_LABEL_SSL_CONF) != null) { + initializeSSLConf(context); + } + } + + /** + * Initialize SSL Config if same is set in conf + * + * @throws IOException - If any + */ + private void initializeSSLConf(Context context) throws IOException { + LOG.info("Initializing SSL configuration"); + + String workDir = conf.get(JobContext.JOB_LOCAL_DIR) + "/work"; + Path[] cacheFiles = context.getLocalCacheFiles(); + + Configuration sslConfig = new Configuration(false); + String sslConfFileName = conf.get(DistCpConstants.CONF_LABEL_SSL_CONF); + Path sslClient = findCacheFile(cacheFiles, sslConfFileName); + if (sslClient == null) { + LOG.warn("SSL Client config file not found. Was looking for " + sslConfFileName + + " in " + Arrays.toString(cacheFiles)); + return; + } + sslConfig.addResource(sslClient); + + String trustStoreFile = conf.get("ssl.client.truststore.location"); + Path trustStorePath = findCacheFile(cacheFiles, trustStoreFile); + sslConfig.set("ssl.client.truststore.location", trustStorePath.toString()); + + String keyStoreFile = conf.get("ssl.client.keystore.location"); + Path keyStorePath = findCacheFile(cacheFiles, keyStoreFile); + sslConfig.set("ssl.client.keystore.location", keyStorePath.toString()); + + try { + OutputStream out = new FileOutputStream(workDir + "/" + sslConfFileName); + try { + sslConfig.writeXml(out); + } finally { + out.close(); + } + conf.set(DistCpConstants.CONF_LABEL_SSL_KEYSTORE, sslConfFileName); + } catch (IOException e) { + LOG.warn("Unable to write out the ssl configuration. " + + "Will fall back to default ssl-client.xml in class path, if there is one", e); + } + } + + /** + * Find entry from distributed cache + * + * @param cacheFiles - All localized cache files + * @param fileName - fileName to search + * @return Path of the filename if found, else null + */ + private Path findCacheFile(Path[] cacheFiles, String fileName) { + if (cacheFiles != null && cacheFiles.length > 0) { + for (Path file : cacheFiles) { + if (file.getName().equals(fileName)) { + return file; + } + } + } + return null; + } + + /** + * Implementation of the Mapper<>::map(). Does the copy. + * @param relPath The target path. + * @param sourceFileStatus The source path. 
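+   * @param context Mapper context, used for status updates, counters and output.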
+ * @throws IOException + */ + @Override + public void map(Text relPath, FileStatus sourceFileStatus, Context context) + throws IOException, InterruptedException { + Path sourcePath = sourceFileStatus.getPath(); + + if (LOG.isDebugEnabled()) + LOG.debug("DistCpMapper::map(): Received " + sourcePath + ", " + relPath); + + Path target = new Path(targetWorkPath.makeQualified(targetFS.getUri(), + targetFS.getWorkingDirectory()) + relPath.toString()); + + EnumSet fileAttributes + = getFileAttributeSettings(context); + + final String description = "Copying " + sourcePath + " to " + target; + context.setStatus(description); + + LOG.info(description); + + try { + FileStatus sourceCurrStatus; + FileSystem sourceFS; + try { + sourceFS = sourcePath.getFileSystem(conf); + sourceCurrStatus = sourceFS.getFileStatus(sourcePath); + } catch (FileNotFoundException e) { + throw new IOException(new RetriableFileCopyCommand.CopyReadException(e)); + } + + FileStatus targetStatus = null; + + try { + targetStatus = targetFS.getFileStatus(target); + } catch (FileNotFoundException ignore) { + if (LOG.isDebugEnabled()) + LOG.debug("Path could not be found: " + target, ignore); + } + + if (targetStatus != null && (targetStatus.isDirectory() != sourceCurrStatus.isDirectory())) { + throw new IOException("Can't replace " + target + ". Target is " + + getFileType(targetStatus) + ", Source is " + getFileType(sourceCurrStatus)); + } + + if (sourceCurrStatus.isDirectory()) { + createTargetDirsWithRetry(description, target, context); + return; + } + + if (skipFile(sourceFS, sourceCurrStatus, target)) { + LOG.info("Skipping copy of " + sourceCurrStatus.getPath() + + " to " + target); + updateSkipCounters(context, sourceCurrStatus); + context.write(null, new Text("SKIP: " + sourceCurrStatus.getPath())); + } + else { + copyFileWithRetry(description, sourceCurrStatus, target, context, + fileAttributes); + } + + DistCpUtils.preserve(target.getFileSystem(conf), target, + sourceCurrStatus, fileAttributes); + + } catch (IOException exception) { + handleFailures(exception, sourceFileStatus, target, context); + } + } + + private String getFileType(FileStatus fileStatus) { + return fileStatus == null ? "N/A" : (fileStatus.isDirectory() ? 
"dir" : "file"); + } + + private static EnumSet + getFileAttributeSettings(Mapper.Context context) { + String attributeString = context.getConfiguration().get( + DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()); + return DistCpUtils.unpackAttributes(attributeString); + } + + private void copyFileWithRetry(String description, FileStatus sourceFileStatus, + Path target, Context context, + EnumSet fileAttributes) throws IOException { + + long bytesCopied; + try { + bytesCopied = (Long)new RetriableFileCopyCommand(description) + .execute(sourceFileStatus, target, context, fileAttributes); + } catch (Exception e) { + context.setStatus("Copy Failure: " + sourceFileStatus.getPath()); + throw new IOException("File copy failed: " + sourceFileStatus.getPath() + + " --> " + target, e); + } + incrementCounter(context, Counter.BYTESEXPECTED, sourceFileStatus.getLen()); + incrementCounter(context, Counter.BYTESCOPIED, bytesCopied); + incrementCounter(context, Counter.COPY, 1); + } + + private void createTargetDirsWithRetry(String description, + Path target, Context context) throws IOException { + try { + new RetriableDirectoryCreateCommand(description).execute(target, context); + } catch (Exception e) { + throw new IOException("mkdir failed for " + target, e); + } + incrementCounter(context, Counter.COPY, 1); + } + + private static void updateSkipCounters(Context context, + FileStatus sourceFile) { + incrementCounter(context, Counter.SKIP, 1); + incrementCounter(context, Counter.BYTESSKIPPED, sourceFile.getLen()); + + } + + private void handleFailures(IOException exception, + FileStatus sourceFileStatus, Path target, + Context context) throws IOException, InterruptedException { + LOG.error("Failure in copying " + sourceFileStatus.getPath() + " to " + + target, exception); + + if (ignoreFailures && exception.getCause() instanceof + RetriableFileCopyCommand.CopyReadException) { + incrementCounter(context, Counter.FAIL, 1); + incrementCounter(context, Counter.BYTESFAILED, sourceFileStatus.getLen()); + context.write(null, new Text("FAIL: " + sourceFileStatus.getPath() + " - " + + StringUtils.stringifyException(exception))); + } + else + throw exception; + } + + private static void incrementCounter(Context context, Counter counter, + long value) { + context.getCounter(counter).increment(value); + } + + private boolean skipFile(FileSystem sourceFS, FileStatus source, Path target) + throws IOException { + return targetFS.exists(target) + && !overWrite + && !mustUpdate(sourceFS, source, target); + } + + private boolean mustUpdate(FileSystem sourceFS, FileStatus source, Path target) + throws IOException { + final FileStatus targetFileStatus = targetFS.getFileStatus(target); + + return syncFolders + && ( + targetFileStatus.getLen() != source.getLen() + || (!skipCrc && + !DistCpUtils.checksumsAreEqual(sourceFS, + source.getPath(), targetFS, target)) + || (source.getBlockSize() != targetFileStatus.getBlockSize() && + preserve.contains(FileAttribute.BLOCKSIZE)) + ); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java new file mode 100644 index 00000000000..eb43aa3927b --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/CopyOutputFormat.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.apache.hadoop.tools.DistCpConstants; + +import java.io.IOException; + +/** + * The CopyOutputFormat is the Hadoop OutputFormat used in DistCp. + * It sets up the Job's Configuration (in the Job-Context) with the settings + * for the work-directory, final commit-directory, etc. It also sets the right + * output-committer. + * @param + * @param + */ +public class CopyOutputFormat extends TextOutputFormat { + + /** + * Setter for the working directory for DistCp (where files will be copied + * before they are moved to the final commit-directory.) + * @param job The Job on whose configuration the working-directory is to be set. + * @param workingDirectory The path to use as the working directory. + */ + public static void setWorkingDirectory(Job job, Path workingDirectory) { + job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, + workingDirectory.toString()); + } + + /** + * Setter for the final directory for DistCp (where files copied will be + * moved, atomically.) + * @param job The Job on whose configuration the working-directory is to be set. + * @param commitDirectory The path to use for final commit. + */ + public static void setCommitDirectory(Job job, Path commitDirectory) { + job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, + commitDirectory.toString()); + } + + /** + * Getter for the working directory. + * @param job The Job from whose configuration the working-directory is to + * be retrieved. + * @return The working-directory Path. + */ + public static Path getWorkingDirectory(Job job) { + return getWorkingDirectory(job.getConfiguration()); + } + + private static Path getWorkingDirectory(Configuration conf) { + String workingDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH); + if (workingDirectory == null || workingDirectory.isEmpty()) { + return null; + } else { + return new Path(workingDirectory); + } + } + + /** + * Getter for the final commit-directory. + * @param job The Job from whose configuration the commit-directory is to be + * retrieved. + * @return The commit-directory Path. 
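A rough sketch of how a driver could wire these settings into a Job (the job name and paths are made up; the actual wiring is done by the DistCp driver when it sets up the copy job):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.tools.mapred.CopyOutputFormat;

    public class OutputFormatSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "distcp-sketch");

        job.setOutputFormatClass(CopyOutputFormat.class);
        // Files are written under the work directory first...
        CopyOutputFormat.setWorkingDirectory(job, new Path("/backups/.distcp.tmp"));
        // ...and, for atomic copies, moved to the commit directory by CopyCommitter.
        CopyOutputFormat.setCommitDirectory(job, new Path("/backups/current"));
      }
    }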
+ */ + public static Path getCommitDirectory(Job job) { + return getCommitDirectory(job.getConfiguration()); + } + + private static Path getCommitDirectory(Configuration conf) { + String commitDirectory = conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH); + if (commitDirectory == null || commitDirectory.isEmpty()) { + return null; + } else { + return new Path(commitDirectory); + } + } + + /** @inheritDoc */ + @Override + public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException { + return new CopyCommitter(getOutputPath(context), context); + } + + /** @inheritDoc */ + @Override + public void checkOutputSpecs(JobContext context) throws IOException { + Configuration conf = context.getConfiguration(); + + if (getCommitDirectory(conf) == null) { + throw new IllegalStateException("Commit directory not configured"); + } + + Path workingPath = getWorkingDirectory(conf); + if (workingPath == null) { + throw new IllegalStateException("Working directory not configured"); + } + + // get delegation token for outDir's file system + TokenCache.obtainTokensForNamenodes(context.getCredentials(), + new Path[] {workingPath}, conf); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java new file mode 100644 index 00000000000..1321fb5c3fb --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableDirectoryCreateCommand.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.hadoop.tools.util.RetriableCommand; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.mapreduce.Mapper; + +/** + * This class extends Retriable command to implement the creation of directories + * with retries on failure. + */ +public class RetriableDirectoryCreateCommand extends RetriableCommand { + + /** + * Constructor, taking a description of the action. + * @param description Verbose description of the copy operation. + */ + public RetriableDirectoryCreateCommand(String description) { + super(description); + } + + /** + * Implementation of RetriableCommand::doExecute(). + * This implements the actual mkdirs() functionality. + * @param arguments Argument-list to the command. + * @return Boolean. True, if the directory could be created successfully. + * @throws Exception IOException, on failure to create the directory. + */ + @Override + protected Object doExecute(Object... 
arguments) throws Exception { + assert arguments.length == 2 : "Unexpected argument list."; + Path target = (Path)arguments[0]; + Mapper.Context context = (Mapper.Context)arguments[1]; + + FileSystem targetFS = target.getFileSystem(context.getConfiguration()); + return targetFS.mkdirs(target); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java new file mode 100644 index 00000000000..7ba26ff4813 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/RetriableFileCopyCommand.java @@ -0,0 +1,245 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.hadoop.tools.util.RetriableCommand; +import org.apache.hadoop.tools.util.ThrottledInputStream; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.tools.DistCpOptions.*; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.io.*; +import java.util.EnumSet; + +/** + * This class extends RetriableCommand to implement the copy of files, + * with retries on failure. + */ +public class RetriableFileCopyCommand extends RetriableCommand { + + private static Log LOG = LogFactory.getLog(RetriableFileCopyCommand.class); + private static int BUFFER_SIZE = 8 * 1024; + + /** + * Constructor, taking a description of the action. + * @param description Verbose description of the copy operation. + */ + public RetriableFileCopyCommand(String description) { + super(description); + } + + /** + * Implementation of RetriableCommand::doExecute(). + * This is the actual copy-implementation. + * @param arguments Argument-list to the command. + * @return Number of bytes copied. + * @throws Exception: CopyReadException, if there are read-failures. All other + * failures are IOExceptions. + */ + @SuppressWarnings("unchecked") + @Override + protected Object doExecute(Object... arguments) throws Exception { + assert arguments.length == 4 : "Unexpected argument list."; + FileStatus source = (FileStatus)arguments[0]; + assert !source.isDirectory() : "Unexpected file-status. 
Expected file."; + Path target = (Path)arguments[1]; + Mapper.Context context = (Mapper.Context)arguments[2]; + EnumSet fileAttributes + = (EnumSet)arguments[3]; + return doCopy(source, target, context, fileAttributes); + } + + private long doCopy(FileStatus sourceFileStatus, Path target, + Mapper.Context context, + EnumSet fileAttributes) + throws IOException { + + Path tmpTargetPath = getTmpFile(target, context); + final Configuration configuration = context.getConfiguration(); + FileSystem targetFS = target.getFileSystem(configuration); + + try { + if (LOG.isDebugEnabled()) { + LOG.debug("Copying " + sourceFileStatus.getPath() + " to " + target); + LOG.debug("Tmp-file path: " + tmpTargetPath); + } + FileSystem sourceFS = sourceFileStatus.getPath().getFileSystem( + configuration); + long bytesRead = copyToTmpFile(tmpTargetPath, targetFS, sourceFileStatus, + context, fileAttributes); + + compareFileLengths(sourceFileStatus, tmpTargetPath, configuration, bytesRead); + compareCheckSums(sourceFS, sourceFileStatus.getPath(), targetFS, tmpTargetPath); + promoteTmpToTarget(tmpTargetPath, target, targetFS); + return bytesRead; + + } finally { + if (targetFS.exists(tmpTargetPath)) + targetFS.delete(tmpTargetPath, false); + } + } + + private long copyToTmpFile(Path tmpTargetPath, FileSystem targetFS, + FileStatus sourceFileStatus, Mapper.Context context, + EnumSet fileAttributes) + throws IOException { + OutputStream outStream = new BufferedOutputStream(targetFS.create( + tmpTargetPath, true, BUFFER_SIZE, + getReplicationFactor(fileAttributes, sourceFileStatus, targetFS), + getBlockSize(fileAttributes, sourceFileStatus, targetFS), context)); + return copyBytes(sourceFileStatus, outStream, BUFFER_SIZE, true, context); + } + + private void compareFileLengths(FileStatus sourceFileStatus, Path target, + Configuration configuration, long bytesRead) + throws IOException { + final Path sourcePath = sourceFileStatus.getPath(); + FileSystem fs = sourcePath.getFileSystem(configuration); + if (fs.getFileStatus(sourcePath).getLen() != bytesRead) + throw new IOException("Mismatch in length of source:" + sourcePath + + " and target:" + target); + } + + private void compareCheckSums(FileSystem sourceFS, Path source, + FileSystem targetFS, Path target) + throws IOException { + if (!DistCpUtils.checksumsAreEqual(sourceFS, source, targetFS, target)) + throw new IOException("Check-sum mismatch between " + + source + " and " + target); + + } + + //If target file exists and unable to delete target - fail + //If target doesn't exist and unable to create parent folder - fail + //If target is successfully deleted and parent exists, if rename fails - fail + private void promoteTmpToTarget(Path tmpTarget, Path target, FileSystem fs) + throws IOException { + if ((fs.exists(target) && !fs.delete(target, false)) + || (!fs.exists(target.getParent()) && !fs.mkdirs(target.getParent())) + || !fs.rename(tmpTarget, target)) { + throw new IOException("Failed to promote tmp-file:" + tmpTarget + + " to: " + target); + } + } + + private Path getTmpFile(Path target, Mapper.Context context) { + Path targetWorkPath = new Path(context.getConfiguration(). + get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH)); + + Path root = target.equals(targetWorkPath)? targetWorkPath.getParent() : targetWorkPath; + LOG.info("Creating temp file: " + + new Path(root, ".distcp.tmp." + context.getTaskAttemptID().toString())); + return new Path(root, ".distcp.tmp." 
+ context.getTaskAttemptID().toString()); + } + + private long copyBytes(FileStatus sourceFileStatus, OutputStream outStream, + int bufferSize, boolean mustCloseStream, + Mapper.Context context) throws IOException { + Path source = sourceFileStatus.getPath(); + byte buf[] = new byte[bufferSize]; + ThrottledInputStream inStream = null; + long totalBytesRead = 0; + + try { + inStream = getInputStream(source, context.getConfiguration()); + int bytesRead = readBytes(inStream, buf); + while (bytesRead >= 0) { + totalBytesRead += bytesRead; + outStream.write(buf, 0, bytesRead); + updateContextStatus(totalBytesRead, context, sourceFileStatus); + bytesRead = inStream.read(buf); + } + } finally { + if (mustCloseStream) + IOUtils.cleanup(LOG, outStream, inStream); + } + + return totalBytesRead; + } + + private void updateContextStatus(long totalBytesRead, Mapper.Context context, + FileStatus sourceFileStatus) { + StringBuilder message = new StringBuilder(DistCpUtils.getFormatter() + .format(totalBytesRead * 100.0f / sourceFileStatus.getLen())); + message.append("% ") + .append(description).append(" [") + .append(DistCpUtils.getStringDescriptionFor(totalBytesRead)) + .append('/') + .append(DistCpUtils.getStringDescriptionFor(sourceFileStatus.getLen())) + .append(']'); + context.setStatus(message.toString()); + } + + private static int readBytes(InputStream inStream, byte buf[]) + throws IOException { + try { + return inStream.read(buf); + } + catch (IOException e) { + throw new CopyReadException(e); + } + } + + private static ThrottledInputStream getInputStream(Path path, Configuration conf) + throws IOException { + try { + FileSystem fs = path.getFileSystem(conf); + long bandwidthMB = conf.getInt(DistCpConstants.CONF_LABEL_BANDWIDTH_MB, + DistCpConstants.DEFAULT_BANDWIDTH_MB); + return new ThrottledInputStream(new BufferedInputStream(fs.open(path)), + bandwidthMB * 1024 * 1024); + } + catch (IOException e) { + throw new CopyReadException(e); + } + } + + private static short getReplicationFactor( + EnumSet fileAttributes, + FileStatus sourceFile, FileSystem targetFS) { + return fileAttributes.contains(FileAttribute.REPLICATION)? + sourceFile.getReplication() : targetFS.getDefaultReplication(); + } + + private static long getBlockSize( + EnumSet fileAttributes, + FileStatus sourceFile, FileSystem targetFS) { + return fileAttributes.contains(FileAttribute.BLOCKSIZE)? + sourceFile.getBlockSize() : targetFS.getDefaultBlockSize(); + } + + /** + * Special subclass of IOException. This is used to distinguish read-operation + * failures from other kinds of IOExceptions. + * The failure to read from source is dealt with specially, in the CopyMapper. + * Such failures may be skipped if the DistCpOptions indicate so. + * Write failures are intolerable, and amount to CopyMapper failure. + */ + public static class CopyReadException extends IOException { + public CopyReadException(Throwable rootCause) { + super(rootCause); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java new file mode 100644 index 00000000000..fd6c0d04611 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/UniformSizeInputFormat.java @@ -0,0 +1,169 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.conf.Configuration; + +import java.io.IOException; +import java.util.List; +import java.util.ArrayList; + +/** + * UniformSizeInputFormat extends the InputFormat<> class, to produce + * input-splits for DistCp. + * It looks at the copy-listing and groups the contents into input-splits such + * that the total-number of bytes to be copied for each input split is + * uniform. + */ +public class UniformSizeInputFormat extends InputFormat { + private static final Log LOG + = LogFactory.getLog(UniformSizeInputFormat.class); + + /** + * Implementation of InputFormat::getSplits(). Returns a list of InputSplits, + * such that the number of bytes to be copied for all the splits are + * approximately equal. + * @param context JobContext for the job. + * @return The list of uniformly-distributed input-splits. + * @throws IOException: On failure. + * @throws InterruptedException + */ + @Override + public List getSplits(JobContext context) + throws IOException, InterruptedException { + Configuration configuration = context.getConfiguration(); + int numSplits = DistCpUtils.getInt(configuration, + JobContext.NUM_MAPS); + + if (numSplits == 0) return new ArrayList(); + + return getSplits(configuration, numSplits, + DistCpUtils.getLong(configuration, + DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED)); + } + + private List getSplits(Configuration configuration, int numSplits, + long totalSizeBytes) throws IOException { + List splits = new ArrayList(numSplits); + long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits); + + FileStatus srcFileStatus = new FileStatus(); + Text srcRelPath = new Text(); + long currentSplitSize = 0; + long lastSplitStart = 0; + long lastPosition = 0; + + final Path listingFilePath = getListingFilePath(configuration); + + if (LOG.isDebugEnabled()) { + LOG.debug("Average bytes per map: " + nBytesPerSplit + + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes); + } + SequenceFile.Reader reader=null; + try { + reader = getListingFileReader(configuration); + while (reader.next(srcRelPath, srcFileStatus)) { + // If adding the current file would cause the bytes per map to exceed + // limit. 
Add the current file to new split + if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) { + FileSplit split = new FileSplit(listingFilePath, lastSplitStart, + lastPosition - lastSplitStart, null); + if (LOG.isDebugEnabled()) { + LOG.debug ("Creating split : " + split + ", bytes in split: " + currentSplitSize); + } + splits.add(split); + lastSplitStart = lastPosition; + currentSplitSize = 0; + } + currentSplitSize += srcFileStatus.getLen(); + lastPosition = reader.getPosition(); + } + if (lastPosition > lastSplitStart) { + FileSplit split = new FileSplit(listingFilePath, lastSplitStart, + lastPosition - lastSplitStart, null); + if (LOG.isDebugEnabled()) { + LOG.info ("Creating split : " + split + ", bytes in split: " + currentSplitSize); + } + splits.add(split); + } + + } finally { + IOUtils.closeStream(reader); + } + + return splits; + } + + private static Path getListingFilePath(Configuration configuration) { + final String listingFilePathString = + configuration.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, ""); + + assert !listingFilePathString.equals("") + : "Couldn't find listing file. Invalid input."; + return new Path(listingFilePathString); + } + + private SequenceFile.Reader getListingFileReader(Configuration configuration) { + + final Path listingFilePath = getListingFilePath(configuration); + try { + final FileSystem fileSystem = listingFilePath.getFileSystem(configuration); + if (!fileSystem.exists(listingFilePath)) + throw new IllegalArgumentException("Listing file doesn't exist at: " + + listingFilePath); + + return new SequenceFile.Reader(configuration, + SequenceFile.Reader.file(listingFilePath)); + } + catch (IOException exception) { + LOG.error("Couldn't find listing file at: " + listingFilePath, exception); + throw new IllegalArgumentException("Couldn't find listing-file at: " + + listingFilePath, exception); + } + } + + /** + * Implementation of InputFormat::createRecordReader(). + * @param split The split for which the RecordReader is sought. + * @param context The context of the current task-attempt. + * @return A SequenceFileRecordReader instance, (since the copy-listing is a + * simple sequence-file.) + * @throws IOException + * @throws InterruptedException + */ + @Override + public RecordReader createRecordReader(InputSplit split, + TaskAttemptContext context) + throws IOException, InterruptedException { + return new SequenceFileRecordReader(); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java new file mode 100644 index 00000000000..8b188286ee8 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputChunk.java @@ -0,0 +1,246 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.tools.mapred.lib; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskID; + +import java.io.IOException; + +/** + * The DynamicInputChunk represents a single chunk of work, when used in + * conjunction with the DynamicInputFormat and the DynamicRecordReader. + * The records in the DynamicInputFormat's input-file are split across various + * DynamicInputChunks. Each one is claimed and processed in an iteration of + * a dynamic-mapper. When a DynamicInputChunk has been exhausted, the faster + * mapper may claim another and process it, until there are no more to be + * consumed. + */ +class DynamicInputChunk { + private static Log LOG = LogFactory.getLog(DynamicInputChunk.class); + + private static Configuration configuration; + private static Path chunkRootPath; + private static String chunkFilePrefix; + private static int numChunksLeft = -1; // Un-initialized before 1st dir-scan. + private static FileSystem fs; + + private Path chunkFilePath; + private SequenceFileRecordReader reader; + private SequenceFile.Writer writer; + + private static void initializeChunkInvariants(Configuration config) + throws IOException { + configuration = config; + Path listingFilePath = new Path(getListingFilePath(configuration)); + chunkRootPath = new Path(listingFilePath.getParent(), "chunkDir"); + fs = chunkRootPath.getFileSystem(configuration); + chunkFilePrefix = listingFilePath.getName() + ".chunk."; + } + + private static String getListingFilePath(Configuration configuration) { + final String listingFileString = configuration.get( + DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, ""); + assert !listingFileString.equals("") : "Listing file not found."; + return listingFileString; + } + + private static boolean areInvariantsInitialized() { + return chunkRootPath != null; + } + + private DynamicInputChunk(String chunkId, Configuration configuration) + throws IOException { + if (!areInvariantsInitialized()) + initializeChunkInvariants(configuration); + + chunkFilePath = new Path(chunkRootPath, chunkFilePrefix + chunkId); + openForWrite(); + } + + + private void openForWrite() throws IOException { + writer = SequenceFile.createWriter( + chunkFilePath.getFileSystem(configuration), configuration, + chunkFilePath, Text.class, FileStatus.class, + SequenceFile.CompressionType.NONE); + + } + + /** + * Factory method to create chunk-files for writing to. + * (For instance, when the DynamicInputFormat splits the input-file into + * chunks.) 
+ * @param chunkId String to identify the chunk. + * @param configuration Configuration, describing the location of the listing- + * file, file-system for the map-job, etc. + * @return A DynamicInputChunk, corresponding to a chunk-file, with the name + * incorporating the chunk-id. + * @throws IOException Exception on failure to create the chunk. + */ + public static DynamicInputChunk createChunkForWrite(String chunkId, + Configuration configuration) throws IOException { + return new DynamicInputChunk(chunkId, configuration); + } + + /** + * Method to write records into a chunk. + * @param key Key from the listing file. + * @param value Corresponding value from the listing file. + * @throws IOException Exception onf failure to write to the file. + */ + public void write(Text key, FileStatus value) throws IOException { + writer.append(key, value); + } + + /** + * Closes streams opened to the chunk-file. + */ + public void close() { + IOUtils.cleanup(LOG, reader, writer); + } + + /** + * Reassigns the chunk to a specified Map-Task, for consumption. + * @param taskId The Map-Task to which a the chunk is to be reassigned. + * @throws IOException Exception on failure to reassign. + */ + public void assignTo(TaskID taskId) throws IOException { + Path newPath = new Path(chunkRootPath, taskId.toString()); + if (!fs.rename(chunkFilePath, newPath)) { + LOG.warn(chunkFilePath + " could not be assigned to " + taskId); + } + } + + private DynamicInputChunk(Path chunkFilePath, + TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + if (!areInvariantsInitialized()) + initializeChunkInvariants(taskAttemptContext.getConfiguration()); + + this.chunkFilePath = chunkFilePath; + openForRead(taskAttemptContext); + } + + private void openForRead(TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + reader = new SequenceFileRecordReader(); + reader.initialize(new FileSplit(chunkFilePath, 0, + DistCpUtils.getFileSize(chunkFilePath, configuration), null), + taskAttemptContext); + } + + /** + * Factory method that + * 1. acquires a chunk for the specified map-task attempt + * 2. returns a DynamicInputChunk associated with the acquired chunk-file. + * @param taskAttemptContext The attempt-context for the map task that's + * trying to acquire a chunk. + * @return The acquired dynamic-chunk. The chunk-file is renamed to the + * attempt-id (from the attempt-context.) + * @throws IOException Exception on failure. + * @throws InterruptedException Exception on failure. + */ + public static DynamicInputChunk acquire(TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + if (!areInvariantsInitialized()) + initializeChunkInvariants(taskAttemptContext.getConfiguration()); + + String taskId + = taskAttemptContext.getTaskAttemptID().getTaskID().toString(); + Path acquiredFilePath = new Path(chunkRootPath, taskId); + + if (fs.exists(acquiredFilePath)) { + LOG.info("Acquiring pre-assigned chunk: " + acquiredFilePath); + return new DynamicInputChunk(acquiredFilePath, taskAttemptContext); + } + + for (FileStatus chunkFile : getListOfChunkFiles()) { + if (fs.rename(chunkFile.getPath(), acquiredFilePath)) { + LOG.info(taskId + " acquired " + chunkFile.getPath()); + return new DynamicInputChunk(acquiredFilePath, taskAttemptContext); + } + else + LOG.warn(taskId + " could not acquire " + chunkFile.getPath()); + } + + return null; + } + + /** + * Method to be called to relinquish an acquired chunk. 
All streams open to + * the chunk are closed, and the chunk-file is deleted. + * @throws IOException Exception thrown on failure to release (i.e. delete) + * the chunk file. + */ + public void release() throws IOException { + close(); + if (!fs.delete(chunkFilePath, false)) { + LOG.error("Unable to release chunk at path: " + chunkFilePath); + throw new IOException("Unable to release chunk at path: " + chunkFilePath); + } + } + + static FileStatus [] getListOfChunkFiles() throws IOException { + Path chunkFilePattern = new Path(chunkRootPath, chunkFilePrefix + "*"); + FileStatus chunkFiles[] = fs.globStatus(chunkFilePattern); + numChunksLeft = chunkFiles.length; + return chunkFiles; + } + + /** + * Getter for the chunk-file's path, on HDFS. + * @return The qualified path to the chunk-file. + */ + public Path getPath() { + return chunkFilePath; + } + + /** + * Getter for the record-reader, opened to the chunk-file. + * @return Opened Sequence-file reader. + */ + public SequenceFileRecordReader getReader() { + assert reader != null : "Reader un-initialized!"; + return reader; + } + + /** + * Getter for the number of chunk-files left in the chunk-file directory. + * Useful to determine how many chunks (and hence, records) are left to be + * processed. + * @return Before the first scan of the directory, the number returned is -1. + * Otherwise, the number of chunk-files seen from the last scan is returned. + */ + public static int getNumChunksLeft() { + return numChunksLeft; + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputFormat.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputFormat.java new file mode 100644 index 00000000000..e1ae9f90f57 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicInputFormat.java @@ -0,0 +1,292 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred.lib; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileStatus; + +import java.util.List; +import java.util.ArrayList; +import java.io.IOException; + +/** + * DynamicInputFormat implements the "Worker pattern" for DistCp. 
+ * Rather than to split up the copy-list into a set of static splits, + * the DynamicInputFormat does the following: + * 1. Splits the copy-list into small chunks on the DFS. + * 2. Creates a set of empty "dynamic" splits, that each consume as many chunks + * as it can. + * This arrangement ensures that a single slow mapper won't slow down the entire + * job (since the slack will be picked up by other mappers, who consume more + * chunks.) + * By varying the split-ratio, one can vary chunk sizes to achieve different + * performance characteristics. + */ +public class DynamicInputFormat extends InputFormat { + private static final Log LOG = LogFactory.getLog(DynamicInputFormat.class); + + private static final String CONF_LABEL_LISTING_SPLIT_RATIO + = "mapred.listing.split.ratio"; + private static final String CONF_LABEL_NUM_SPLITS + = "mapred.num.splits"; + private static final String CONF_LABEL_NUM_ENTRIES_PER_CHUNK + = "mapred.num.entries.per.chunk"; + + /** + * Implementation of InputFormat::getSplits(). This method splits up the + * copy-listing file into chunks, and assigns the first batch to different + * tasks. + * @param jobContext JobContext for the map job. + * @return The list of (empty) dynamic input-splits. + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public List getSplits(JobContext jobContext) + throws IOException, InterruptedException { + LOG.info("DynamicInputFormat: Getting splits for job:" + + jobContext.getJobID()); + return createSplits(jobContext, + splitCopyListingIntoChunksWithShuffle(jobContext)); + } + + private List createSplits(JobContext jobContext, + List chunks) + throws IOException { + int numMaps = getNumMapTasks(jobContext.getConfiguration()); + + final int nSplits = Math.min(numMaps, chunks.size()); + List splits = new ArrayList(nSplits); + + for (int i=0; i< nSplits; ++i) { + TaskID taskId = new TaskID(jobContext.getJobID(), TaskType.MAP, i); + chunks.get(i).assignTo(taskId); + splits.add(new FileSplit(chunks.get(i).getPath(), 0, + // Setting non-zero length for FileSplit size, to avoid a possible + // future when 0-sized file-splits are considered "empty" and skipped + // over. + MIN_RECORDS_PER_CHUNK, + null)); + } + DistCpUtils.publish(jobContext.getConfiguration(), + CONF_LABEL_NUM_SPLITS, splits.size()); + return splits; + } + + private static int N_CHUNKS_OPEN_AT_ONCE_DEFAULT = 16; + + private List splitCopyListingIntoChunksWithShuffle + (JobContext context) throws IOException { + + final Configuration configuration = context.getConfiguration(); + int numRecords = getNumberOfRecords(configuration); + int numMaps = getNumMapTasks(configuration); + // Number of chunks each map will process, on average. 
+ int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords); + validateNumChunksUsing(splitRatio, numMaps); + + int numEntriesPerChunk = (int)Math.ceil((float)numRecords + /(splitRatio * numMaps)); + DistCpUtils.publish(context.getConfiguration(), + CONF_LABEL_NUM_ENTRIES_PER_CHUNK, + numEntriesPerChunk); + + final int nChunksTotal = (int)Math.ceil((float)numRecords/numEntriesPerChunk); + int nChunksOpenAtOnce + = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal); + + Path listingPath = getListingFilePath(configuration); + SequenceFile.Reader reader + = new SequenceFile.Reader(configuration, + SequenceFile.Reader.file(listingPath)); + + List openChunks + = new ArrayList(); + + List chunksFinal = new ArrayList(); + + FileStatus fileStatus = new FileStatus(); + Text relPath = new Text(); + int recordCounter = 0; + int chunkCount = 0; + + try { + + while (reader.next(relPath, fileStatus)) { + if (recordCounter % (nChunksOpenAtOnce*numEntriesPerChunk) == 0) { + // All chunks full. Create new chunk-set. + closeAll(openChunks); + chunksFinal.addAll(openChunks); + + openChunks = createChunks( + configuration, chunkCount, nChunksTotal, nChunksOpenAtOnce); + + chunkCount += openChunks.size(); + + nChunksOpenAtOnce = openChunks.size(); + recordCounter = 0; + } + + // Shuffle into open chunks. + openChunks.get(recordCounter%nChunksOpenAtOnce).write(relPath, fileStatus); + ++recordCounter; + } + + } finally { + closeAll(openChunks); + chunksFinal.addAll(openChunks); + IOUtils.closeStream(reader); + } + + LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size()); + return chunksFinal; + } + + private static void validateNumChunksUsing(int splitRatio, int numMaps) + throws IOException { + if (splitRatio * numMaps > MAX_CHUNKS_TOLERABLE) + throw new IOException("Too many chunks created with splitRatio:" + + splitRatio + ", numMaps:" + numMaps + + ". Reduce numMaps or decrease split-ratio to proceed."); + } + + private static void closeAll(List chunks) { + for (DynamicInputChunk chunk: chunks) + chunk.close(); + } + + private static List createChunks(Configuration config, + int chunkCount, int nChunksTotal, int nChunksOpenAtOnce) + throws IOException { + List chunks = new ArrayList(); + int chunkIdUpperBound + = Math.min(nChunksTotal, chunkCount + nChunksOpenAtOnce); + + // If there will be fewer than nChunksOpenAtOnce chunks left after + // the current batch of chunks, fold the remaining chunks into + // the current batch. + if (nChunksTotal - chunkIdUpperBound < nChunksOpenAtOnce) + chunkIdUpperBound = nChunksTotal; + + for (int i=chunkCount; i < chunkIdUpperBound; ++i) + chunks.add(createChunk(i, config)); + return chunks; + } + + private static DynamicInputChunk createChunk(int chunkId, Configuration config) + throws IOException { + return DynamicInputChunk.createChunkForWrite(String.format("%05d", chunkId), + config); + } + + + private static Path getListingFilePath(Configuration configuration) { + String listingFilePathString = configuration.get( + DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, ""); + + assert !listingFilePathString.equals("") : "Listing file not found."; + + Path listingFilePath = new Path(listingFilePathString); + try { + assert listingFilePath.getFileSystem(configuration) + .exists(listingFilePath) : "Listing file: " + listingFilePath + + " not found."; + } catch (IOException e) { + assert false : "Listing file: " + listingFilePath + + " couldn't be accessed. 
" + e.getMessage(); + } + return listingFilePath; + } + + private static int getNumberOfRecords(Configuration configuration) { + return DistCpUtils.getInt(configuration, + DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS); + } + + private static int getNumMapTasks(Configuration configuration) { + return DistCpUtils.getInt(configuration, + JobContext.NUM_MAPS); + } + + private static int getListingSplitRatio(Configuration configuration, + int numMaps, int numPaths) { + return configuration.getInt( + CONF_LABEL_LISTING_SPLIT_RATIO, + getSplitRatio(numMaps, numPaths)); + } + + private static final int MAX_CHUNKS_TOLERABLE = 400; + private static final int MAX_CHUNKS_IDEAL = 100; + private static final int MIN_RECORDS_PER_CHUNK = 5; + private static final int SPLIT_RATIO_DEFAULT = 2; + + /** + * Package private, for testability. + * @param nMaps The number of maps requested for. + * @param nRecords The number of records to be copied. + * @return The number of splits each map should handle, ideally. + */ + static int getSplitRatio(int nMaps, int nRecords) { + if (nMaps == 1) { + LOG.warn("nMaps == 1. Why use DynamicInputFormat?"); + return 1; + } + + if (nMaps > MAX_CHUNKS_IDEAL) + return SPLIT_RATIO_DEFAULT; + + int nPickups = (int)Math.ceil((float)MAX_CHUNKS_IDEAL/nMaps); + int nRecordsPerChunk = (int)Math.ceil((float)nRecords/(nMaps*nPickups)); + + return nRecordsPerChunk < MIN_RECORDS_PER_CHUNK ? + SPLIT_RATIO_DEFAULT : nPickups; + } + + static int getNumEntriesPerChunk(Configuration configuration) { + return DistCpUtils.getInt(configuration, + CONF_LABEL_NUM_ENTRIES_PER_CHUNK); + } + + + /** + * Implementation of Inputformat::createRecordReader(). + * @param inputSplit The split for which the RecordReader is required. + * @param taskAttemptContext TaskAttemptContext for the current attempt. + * @return DynamicRecordReader instance. + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public RecordReader createRecordReader( + InputSplit inputSplit, + TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + return new DynamicRecordReader(); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicRecordReader.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicRecordReader.java new file mode 100644 index 00000000000..40d75f44713 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/mapred/lib/DynamicRecordReader.java @@ -0,0 +1,203 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.tools.mapred.lib; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.conf.Configuration; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +/** + * The DynamicRecordReader is used in conjunction with the DynamicInputFormat + * to implement the "Worker pattern" for DistCp. + * The DynamicRecordReader is responsible for: + * 1. Presenting the contents of each chunk to DistCp's mapper. + * 2. Acquiring a new chunk when the current chunk has been completely consumed, + * transparently. + */ +public class DynamicRecordReader extends RecordReader { + private static final Log LOG = LogFactory.getLog(DynamicRecordReader.class); + private TaskAttemptContext taskAttemptContext; + private Configuration configuration; + private DynamicInputChunk chunk; + private TaskID taskId; + + // Data required for progress indication. + private int numRecordsPerChunk; // Constant per job. + private int totalNumRecords; // Constant per job. + private int numRecordsProcessedByThisMap = 0; + private long timeOfLastChunkDirScan = 0; + private boolean isChunkDirAlreadyScanned = false; + + private static long TIME_THRESHOLD_FOR_DIR_SCANS = TimeUnit.MINUTES.toMillis(5); + + /** + * Implementation for RecordReader::initialize(). Initializes the internal + * RecordReader to read from chunks. + * @param inputSplit The InputSplit for the map. Ignored entirely. + * @param taskAttemptContext The AttemptContext. + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public void initialize(InputSplit inputSplit, + TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + numRecordsPerChunk = DynamicInputFormat.getNumEntriesPerChunk( + taskAttemptContext.getConfiguration()); + this.taskAttemptContext = taskAttemptContext; + configuration = taskAttemptContext.getConfiguration(); + taskId = taskAttemptContext.getTaskAttemptID().getTaskID(); + chunk = DynamicInputChunk.acquire(this.taskAttemptContext); + timeOfLastChunkDirScan = System.currentTimeMillis(); + isChunkDirAlreadyScanned = false; + + totalNumRecords = getTotalNumRecords(); + + } + + private int getTotalNumRecords() { + return DistCpUtils.getInt(configuration, + DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS); + } + + /** + * Implementation of RecordReader::nextValue(). + * Reads the contents of the current chunk and returns them. When a chunk has + * been completely exhausted, an new chunk is acquired and read, + * transparently. + * @return True, if the nextValue() could be traversed to. False, otherwise. + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public boolean nextKeyValue() + throws IOException, InterruptedException { + + if (chunk == null) { + if (LOG.isDebugEnabled()) + LOG.debug(taskId + ": RecordReader is null. No records to be read."); + return false; + } + + if (chunk.getReader().nextKeyValue()) { + ++numRecordsProcessedByThisMap; + return true; + } + + if (LOG.isDebugEnabled()) + LOG.debug(taskId + ": Current chunk exhausted. 
" + + " Attempting to pick up new one."); + + chunk.release(); + timeOfLastChunkDirScan = System.currentTimeMillis(); + isChunkDirAlreadyScanned = false; + + chunk = DynamicInputChunk.acquire(taskAttemptContext); + + if (chunk == null) return false; + + if (chunk.getReader().nextKeyValue()) { + ++numRecordsProcessedByThisMap; + return true; + } + else { + return false; + } + } + + /** + * Implementation of RecordReader::getCurrentKey(). + * @return The key of the current record. (i.e. the source-path.) + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public K getCurrentKey() + throws IOException, InterruptedException { + return chunk.getReader().getCurrentKey(); + } + + /** + * Implementation of RecordReader::getCurrentValue(). + * @return The value of the current record. (i.e. the target-path.) + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public V getCurrentValue() + throws IOException, InterruptedException { + return chunk.getReader().getCurrentValue(); + } + + /** + * Implementation of RecordReader::getProgress(). + * @return A fraction [0.0,1.0] indicating the progress of a DistCp mapper. + * @throws IOException, on failure. + * @throws InterruptedException + */ + @Override + public float getProgress() + throws IOException, InterruptedException { + final int numChunksLeft = getNumChunksLeft(); + if (numChunksLeft < 0) {// Un-initialized. i.e. Before 1st dir-scan. + assert numRecordsProcessedByThisMap <= numRecordsPerChunk + : "numRecordsProcessedByThisMap:" + numRecordsProcessedByThisMap + + " exceeds numRecordsPerChunk:" + numRecordsPerChunk; + return ((float) numRecordsProcessedByThisMap) / totalNumRecords; + // Conservative estimate, till the first directory scan. + } + + return ((float) numRecordsProcessedByThisMap) + /(numRecordsProcessedByThisMap + numRecordsPerChunk*numChunksLeft); + } + + private int getNumChunksLeft() throws IOException { + long now = System.currentTimeMillis(); + boolean tooLongSinceLastDirScan + = now - timeOfLastChunkDirScan > TIME_THRESHOLD_FOR_DIR_SCANS; + + if (tooLongSinceLastDirScan + || (!isChunkDirAlreadyScanned && + numRecordsProcessedByThisMap%numRecordsPerChunk + > numRecordsPerChunk/2)) { + DynamicInputChunk.getListOfChunkFiles(); + isChunkDirAlreadyScanned = true; + timeOfLastChunkDirScan = now; + } + + return DynamicInputChunk.getNumChunksLeft(); + } + /** + * Implementation of RecordReader::close(). + * Closes the RecordReader. + * @throws IOException, on failure. + */ + @Override + public void close() + throws IOException { + if (chunk != null) + chunk.close(); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java new file mode 100644 index 00000000000..24c2c947e66 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java @@ -0,0 +1,343 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.util; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileChecksum; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.tools.DistCpOptions.FileAttribute; +import org.apache.hadoop.tools.mapred.UniformSizeInputFormat; +import org.apache.hadoop.tools.DistCpOptions; +import org.apache.hadoop.mapreduce.InputFormat; + +import java.io.IOException; +import java.util.EnumSet; +import java.util.Locale; +import java.text.DecimalFormat; +import java.net.URI; +import java.net.InetAddress; +import java.net.UnknownHostException; + +/** + * Utility functions used in DistCp. + */ +public class DistCpUtils { + + private static final Log LOG = LogFactory.getLog(DistCpUtils.class); + + /** + * Retrieves size of the file at the specified path. + * @param path The path of the file whose size is sought. + * @param configuration Configuration, to retrieve the appropriate FileSystem. + * @return The file-size, in number of bytes. + * @throws IOException, on failure. + */ + public static long getFileSize(Path path, Configuration configuration) + throws IOException { + if (LOG.isDebugEnabled()) + LOG.debug("Retrieving file size for: " + path); + return path.getFileSystem(configuration).getFileStatus(path).getLen(); + } + + /** + * Utility to publish a value to a configuration. + * @param configuration The Configuration to which the value must be written. + * @param label The label for the value being published. + * @param value The value being published. + * @param The type of the value. + */ + public static void publish(Configuration configuration, + String label, T value) { + configuration.set(label, String.valueOf(value)); + } + + /** + * Utility to retrieve a specified key from a Configuration. Throw exception + * if not found. + * @param configuration The Configuration in which the key is sought. + * @param label The key being sought. + * @return Integer value of the key. + */ + public static int getInt(Configuration configuration, String label) { + int value = configuration.getInt(label, -1); + assert value >= 0 : "Couldn't find " + label; + return value; + } + + /** + * Utility to retrieve a specified key from a Configuration. Throw exception + * if not found. + * @param configuration The Configuration in which the key is sought. + * @param label The key being sought. + * @return Long value of the key. + */ + public static long getLong(Configuration configuration, String label) { + long value = configuration.getLong(label, -1); + assert value >= 0 : "Couldn't find " + label; + return value; + } + + /** + * Returns the class that implements a copy strategy. 
Looks up the implementation for + * a particular strategy from distcp-default.xml + * + * @param conf - Configuration object + * @param options - Handle to input options + * @return Class implementing the strategy specified in options. + */ + public static Class getStrategy(Configuration conf, + DistCpOptions options) { + String confLabel = "distcp." + + options.getCopyStrategy().toLowerCase(Locale.getDefault()) + ".strategy.impl"; + return conf.getClass(confLabel, UniformSizeInputFormat.class, InputFormat.class); + } + + /** + * Gets relative path of child path with respect to a root path + * For ex. If childPath = /tmp/abc/xyz/file and + * sourceRootPath = /tmp/abc + * Relative path would be /xyz/file + * If childPath = /file and + * sourceRootPath = / + * Relative path would be /file + * @param sourceRootPath - Source root path + * @param childPath - Path for which relative path is required + * @return - Relative portion of the child path (always prefixed with / + * unless it is empty + */ + public static String getRelativePath(Path sourceRootPath, Path childPath) { + String childPathString = childPath.toUri().getPath(); + String sourceRootPathString = sourceRootPath.toUri().getPath(); + return sourceRootPathString.equals("/") ? childPathString : + childPathString.substring(sourceRootPathString.length()); + } + + /** + * Pack file preservation attributes into a string, containing + * just the first character of each preservation attribute + * @param attributes - Attribute set to preserve + * @return - String containing first letters of each attribute to preserve + */ + public static String packAttributes(EnumSet attributes) { + StringBuffer buffer = new StringBuffer(5); + int len = 0; + for (FileAttribute attribute : attributes) { + buffer.append(attribute.name().charAt(0)); + len++; + } + return buffer.substring(0, len); + } + + /** + * Un packs preservation attribute string containing the first character of + * each preservation attribute back to a set of attributes to preserve + * @param attributes - Attribute string + * @return - Attribute set + */ + public static EnumSet unpackAttributes(String attributes) { + EnumSet retValue = EnumSet.noneOf(FileAttribute.class); + + if (attributes != null) { + for (int index = 0; index < attributes.length(); index++) { + retValue.add(FileAttribute.getAttribute(attributes.charAt(index))); + } + } + + return retValue; + } + + /** + * Preserve attribute on file matching that of the file status being sent + * as argument. Barring the block size, all the other attributes are preserved + * by this function + * + * @param targetFS - File system + * @param path - Path that needs to preserve original file status + * @param srcFileStatus - Original file status + * @param attributes - Attribute set that need to be preserved + * @throws IOException - Exception if any (particularly relating to group/owner + * change or any transient error) + */ + public static void preserve(FileSystem targetFS, Path path, + FileStatus srcFileStatus, + EnumSet attributes) throws IOException { + + FileStatus targetFileStatus = targetFS.getFileStatus(path); + String group = targetFileStatus.getGroup(); + String user = targetFileStatus.getOwner(); + boolean chown = false; + + if (attributes.contains(FileAttribute.PERMISSION) && + !srcFileStatus.getPermission().equals(targetFileStatus.getPermission())) { + targetFS.setPermission(path, srcFileStatus.getPermission()); + } + + if (attributes.contains(FileAttribute.REPLICATION) && ! 
targetFileStatus.isDirectory() && + srcFileStatus.getReplication() != targetFileStatus.getReplication()) { + targetFS.setReplication(path, srcFileStatus.getReplication()); + } + + if (attributes.contains(FileAttribute.GROUP) && + !group.equals(srcFileStatus.getGroup())) { + group = srcFileStatus.getGroup(); + chown = true; + } + + if (attributes.contains(FileAttribute.USER) && + !user.equals(srcFileStatus.getOwner())) { + user = srcFileStatus.getOwner(); + chown = true; + } + + if (chown) { + targetFS.setOwner(path, user, group); + } + } + + /** + * Sort sequence file containing FileStatus and Text as key and value respecitvely + * + * @param fs - File System + * @param conf - Configuration + * @param sourceListing - Source listing file + * @return Path of the sorted file. Is source file with _sorted appended to the name + * @throws IOException - Any exception during sort. + */ + public static Path sortListing(FileSystem fs, Configuration conf, Path sourceListing) + throws IOException { + SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, Text.class, FileStatus.class, conf); + Path output = new Path(sourceListing.toString() + "_sorted"); + + if (fs.exists(output)) { + fs.delete(output, false); + } + + sorter.sort(sourceListing, output); + return output; + } + + /** + * String utility to convert a number-of-bytes to human readable format. + */ + private static ThreadLocal FORMATTER + = new ThreadLocal() { + @Override + protected DecimalFormat initialValue() { + return new DecimalFormat("0.0"); + } + }; + + public static DecimalFormat getFormatter() { + return FORMATTER.get(); + } + + public static String getStringDescriptionFor(long nBytes) { + + char units [] = {'B', 'K', 'M', 'G', 'T', 'P'}; + + double current = nBytes; + double prev = current; + int index = 0; + + while ((current = current/1024) >= 1) { + prev = current; + ++index; + } + + assert index < units.length : "Too large a number."; + + return getFormatter().format(prev) + units[index]; + } + + /** + * Utility to compare checksums for the paths specified. + * + * If checksums's can't be retrieved, it doesn't fail the test + * Only time the comparison would fail is when checksums are + * available and they don't match + * + * @param sourceFS FileSystem for the source path. + * @param source The source path. + * @param targetFS FileSystem for the target path. + * @param target The target path. + * @return If either checksum couldn't be retrieved, the function returns + * false. If checksums are retrieved, the function returns true if they match, + * and false otherwise. + * @throws IOException if there's an exception while retrieving checksums. 
+ */ + public static boolean checksumsAreEqual(FileSystem sourceFS, Path source, + FileSystem targetFS, Path target) + throws IOException { + FileChecksum sourceChecksum = null; + FileChecksum targetChecksum = null; + try { + sourceChecksum = sourceFS.getFileChecksum(source); + targetChecksum = targetFS.getFileChecksum(target); + } catch (IOException e) { + LOG.error("Unable to retrieve checksum for " + source + " or " + target, e); + } + return (sourceChecksum == null || targetChecksum == null || + sourceChecksum.equals(targetChecksum)); + } + + /* see if two file systems are the same or not + * + */ + public static boolean compareFs(FileSystem srcFs, FileSystem destFs) { + URI srcUri = srcFs.getUri(); + URI dstUri = destFs.getUri(); + if (srcUri.getScheme() == null) { + return false; + } + if (!srcUri.getScheme().equals(dstUri.getScheme())) { + return false; + } + String srcHost = srcUri.getHost(); + String dstHost = dstUri.getHost(); + if ((srcHost != null) && (dstHost != null)) { + try { + srcHost = InetAddress.getByName(srcHost).getCanonicalHostName(); + dstHost = InetAddress.getByName(dstHost).getCanonicalHostName(); + } catch(UnknownHostException ue) { + if (LOG.isDebugEnabled()) + LOG.debug("Could not compare file-systems. Unknown host: ", ue); + return false; + } + if (!srcHost.equals(dstHost)) { + return false; + } + } + else if (srcHost == null && dstHost != null) { + return false; + } + else if (srcHost != null) { + return false; + } + + //check for ports + + return srcUri.getPort() == dstUri.getPort(); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java new file mode 100644 index 00000000000..1d248f082a7 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/RetriableCommand.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.hadoop.tools.util; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.retry.RetryPolicy; +import org.apache.hadoop.io.retry.RetryPolicies; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +/** + * This class represents commands that be retried on failure, in a configurable + * manner. + */ +public abstract class RetriableCommand { + + private static Log LOG = LogFactory.getLog(RetriableCommand.class); + + private static final long DELAY_MILLISECONDS = 500; + private static final int MAX_RETRIES = 3; + + private RetryPolicy retryPolicy = RetryPolicies. 
+ exponentialBackoffRetry(MAX_RETRIES, DELAY_MILLISECONDS, TimeUnit.MILLISECONDS); + protected String description; + + /** + * Constructor. + * @param description The human-readable description of the command. + */ + public RetriableCommand(String description) { + this.description = description; + } + + /** + * Constructor. + * @param description The human-readable description of the command. + * @param retryPolicy The RetryHandler to be used to compute retries. + */ + public RetriableCommand(String description, RetryPolicy retryPolicy) { + this(description); + setRetryPolicy(retryPolicy); + } + + /** + * Implement this interface-method define the command-logic that will be + * retried on failure (i.e. with Exception). + * @param arguments Argument-list to the command. + * @return Generic "Object". + * @throws Exception Throws Exception on complete failure. + */ + protected abstract Object doExecute(Object... arguments) throws Exception; + + /** + * The execute() method invokes doExecute() until either: + * 1. doExecute() succeeds, or + * 2. the command may no longer be retried (e.g. runs out of retry-attempts). + * @param arguments The list of arguments for the command. + * @return Generic "Object" from doExecute(), on success. + * @throws IOException, IOException, on complete failure. + */ + public Object execute(Object... arguments) throws Exception { + Exception latestException; + int counter = 0; + do { + try { + return doExecute(arguments); + } catch(Exception exception) { + LOG.error("Failure in Retriable command: " + description, exception); + latestException = exception; + } + counter++; + } while (retryPolicy.shouldRetry(latestException, counter, 0, true).equals(RetryPolicy.RetryAction.RETRY)); + + throw new IOException("Couldn't run retriable-command: " + description, + latestException); + } + + /** + * Fluent-interface to change the RetryHandler. + * @param retryHandler The new RetryHandler instance to be used. + * @return Self. + */ + public RetriableCommand setRetryPolicy(RetryPolicy retryHandler) { + this.retryPolicy = retryHandler; + return this; + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java new file mode 100644 index 00000000000..cf442f4a8ae --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/ThrottledInputStream.java @@ -0,0 +1,139 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.util; + +import java.io.IOException; +import java.io.InputStream; + +/** + * The ThrottleInputStream provides bandwidth throttling on a specified + * InputStream. 
It is implemented as a wrapper on top of another InputStream + * instance. + * The throttling works by examining the number of bytes read from the underlying + * InputStream from the beginning, and sleep()ing for a time interval if + * the byte-transfer is found exceed the specified tolerable maximum. + * (Thus, while the read-rate might exceed the maximum for a given short interval, + * the average tends towards the specified maximum, overall.) + */ +public class ThrottledInputStream extends InputStream { + + private final InputStream rawStream; + private final long maxBytesPerSec; + private final long startTime = System.currentTimeMillis(); + + private long bytesRead = 0; + private long totalSleepTime = 0; + + private static final long SLEEP_DURATION_MS = 50; + + public ThrottledInputStream(InputStream rawStream) { + this(rawStream, Long.MAX_VALUE); + } + + public ThrottledInputStream(InputStream rawStream, long maxBytesPerSec) { + assert maxBytesPerSec > 0 : "Bandwidth " + maxBytesPerSec + " is invalid"; + this.rawStream = rawStream; + this.maxBytesPerSec = maxBytesPerSec; + } + + /** @inheritDoc */ + @Override + public int read() throws IOException { + throttle(); + int data = rawStream.read(); + if (data != -1) { + bytesRead++; + } + return data; + } + + /** @inheritDoc */ + @Override + public int read(byte[] b) throws IOException { + throttle(); + int readLen = rawStream.read(b); + if (readLen != -1) { + bytesRead += readLen; + } + return readLen; + } + + /** @inheritDoc */ + @Override + public int read(byte[] b, int off, int len) throws IOException { + throttle(); + int readLen = rawStream.read(b, off, len); + if (readLen != -1) { + bytesRead += readLen; + } + return readLen; + } + + private void throttle() throws IOException { + if (getBytesPerSec() > maxBytesPerSec) { + try { + Thread.sleep(SLEEP_DURATION_MS); + totalSleepTime += SLEEP_DURATION_MS; + } catch (InterruptedException e) { + throw new IOException("Thread aborted", e); + } + } + } + + /** + * Getter for the number of bytes read from this stream, since creation. + * @return The number of bytes. + */ + public long getTotalBytesRead() { + return bytesRead; + } + + /** + * Getter for the read-rate from this stream, since creation. + * Calculated as bytesRead/elapsedTimeSinceStart. + * @return Read rate, in bytes/sec. + */ + public long getBytesPerSec() { + long elapsed = (System.currentTimeMillis() - startTime) / 1000; + if (elapsed == 0) { + return bytesRead; + } else { + return bytesRead / elapsed; + } + } + + /** + * Getter the total time spent in sleep. + * @return Number of milliseconds spent in sleep. 
+ */ + public long getTotalSleepTime() { + return totalSleepTime; + } + + /** @inheritDoc */ + @Override + public String toString() { + return "ThrottledInputStream{" + + "bytesRead=" + bytesRead + + ", maxBytesPerSec=" + maxBytesPerSec + + ", bytesPerSec=" + getBytesPerSec() + + ", totalSleepTime=" + totalSleepTime + + '}'; + } +} diff --git a/hadoop-tools/hadoop-distcp/src/main/resources/distcp-default.xml b/hadoop-tools/hadoop-distcp/src/main/resources/distcp-default.xml new file mode 100644 index 00000000000..0d936c9f0c1 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/main/resources/distcp-default.xml @@ -0,0 +1,41 @@ + + + + + + + + + distcp.dynamic.strategy.impl + org.apache.hadoop.tools.mapred.lib.DynamicInputFormat + Implementation of dynamic input format + + + + distcp.static.strategy.impl + org.apache.hadoop.tools.mapred.UniformSizeInputFormat + Implementation of static input format + + + + mapred.job.map.memory.mb + 1024 + + + + mapred.job.reduce.memory.mb + 1024 + + + + mapred.reducer.new-api + true + + + + mapreduce.reduce.class + org.apache.hadoop.mapreduce.Reducer + + + diff --git a/hadoop-tools/hadoop-distcp/src/site/fml/faq.fml b/hadoop-tools/hadoop-distcp/src/site/fml/faq.fml new file mode 100644 index 00000000000..65ef3997ed8 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/fml/faq.fml @@ -0,0 +1,98 @@ + + + + + + + General + + + Why does -update not create the parent source-directory under + a pre-existing target directory? + The behaviour of -update and -overwrite + is described in detail in the Usage section of this document. In short, + if either option is used with a pre-existing destination directory, the + contents of each source directory is copied over, rather + than the source-directory itself. + This behaviour is consistent with the legacy DistCp implementation as well. + + + + + How does the new DistCp differ in semantics from the Legacy + DistCp? + +
    +
  • With Legacy DistCp, files that were skipped during copy also had their file-attributes (permissions, owner/group info, etc.) left unchanged. These attributes are now updated at the target, even if the file-copy itself is skipped.
  • +
  • Empty root directories among the source-path inputs were not created at the target by Legacy DistCp. These are now created.
  • +
+
+
+ + + Why does the new DistCp use more maps than legacy DistCp? + +

Legacy DistCp works by figuring out which files actually need to be copied to the target before the copy-job is launched, and then launching as many maps as are required for the copy. So if a majority of the files need to be skipped (because they already exist, for example), fewer maps will be needed. As a consequence, the time spent in setup (i.e. before the M/R job) is higher.

+

The new DistCp calculates only the contents of the source-paths. It doesn't try to filter out which files can be skipped; that decision is put off until the M/R job runs. This makes the setup much faster, but the number of maps launched will be as specified in the -m option, or 20 (the default) if unspecified.

+
+
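For example, to pin the map-count explicitly (an illustrative invocation; the cluster paths are hypothetical):

bash$ hadoop distcp -m 40 hdfs://nn1:8020/source hdfs://nn2:8020/target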
+ + + Why does DistCp not run faster when more maps are specified? + +

At present, the smallest unit of work for DistCp is a file, i.e. a file is processed by only one map. Increasing the number of maps to a value exceeding the number of files would yield no performance benefit: the number of maps launched would simply equal the number of files.

+
+
+ + + Why does DistCp run out of memory? + +

If the number of individual files/directories being copied from + the source path(s) is extremely large (e.g. 1,000,000 paths), DistCp might + run out of memory while determining the list of paths for copy. This is + not unique to the new DistCp implementation.

+

To get around this, consider changing the -Xmx JVM + heap-size parameters, as follows:

+

bash$ export HADOOP_CLIENT_OPTS="-Xms64m -Xmx1024m"

+

bash$ hadoop distcp /source /target

+
+
+ +
+
diff --git a/hadoop-tools/hadoop-distcp/src/site/pdf.xml b/hadoop-tools/hadoop-distcp/src/site/pdf.xml new file mode 100644 index 00000000000..c717c7d00da --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/pdf.xml @@ -0,0 +1,47 @@ + + + + + + + + + ${project.name} + + + + + + + + + + + + ${project.name} + v. ${project.version} + User Guide + ${project.name} + Apache Hadoop + + diff --git a/hadoop-tools/hadoop-distcp/src/site/xdoc/appendix.xml b/hadoop-tools/hadoop-distcp/src/site/xdoc/appendix.xml new file mode 100644 index 00000000000..84662c076c7 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/xdoc/appendix.xml @@ -0,0 +1,125 @@ + + + + Appendix + + +
+ +

By default, DistCp makes an attempt to size each map comparably so + that each copies roughly the same number of bytes. Note that files are the + finest level of granularity, so increasing the number of simultaneous + copiers (i.e. maps) may not always increase the number of + simultaneous copies nor the overall throughput.

+ +

The new DistCp also provides a strategy to "dynamically" size maps, allowing faster data-nodes to copy more bytes than slower nodes. Using -strategy dynamic (explained in the Architecture), rather than assigning a fixed set of source-files to each map-task, files are split into several sets. The number of sets exceeds the number of maps, usually by a factor of 2-3. Each map picks up and copies all files listed in a chunk. When a chunk is exhausted, a new chunk is acquired and processed, until no more chunks remain.

+ +

By not assigning a source-path to a fixed map, faster map-tasks (i.e. + data-nodes) are able to consume more chunks, and thus copy more data, + than slower nodes. While this distribution isn't uniform, it is + fair with regard to each mapper's capacity.

+ +

The dynamic-strategy is implemented by the DynamicInputFormat. It + provides superior performance under most conditions.

+ +
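For example, the dynamic strategy can be selected on the command line as follows (an illustrative invocation; the paths are hypothetical):

bash$ hadoop distcp -strategy dynamic hdfs://nn1:8020/source hdfs://nn2:8020/target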

Tuning the number of maps to the size of the source and + destination clusters, the size of the copy, and the available + bandwidth is recommended for long-running and regularly run jobs.

+ +
+ +
+ +

For copying between two different versions of Hadoop, one will + usually use HftpFileSystem. This is a read-only FileSystem, so DistCp + must be run on the destination cluster (more specifically, on + TaskTrackers that can write to the destination cluster). Each source is + specified as hftp://<dfs.http.address>/<path> + (the default dfs.http.address is + <namenode>:50070).

+ +
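For instance, assuming the source NameNode's dfs.http.address is nn1:50070, an illustrative invocation (run from the destination cluster; the paths are hypothetical) would be:

bash$ hadoop distcp hftp://nn1:50070/source hdfs://nn2:8020/target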
+ +
+ +

As mentioned previously, should a map fail to copy one of its inputs, there will be several side-effects.

+ +
    + +
  • Unless -overwrite is specified, files successfully + copied by a previous map on a re-execution will be marked as + "skipped".
  • + +
  • If a map fails mapred.map.max.attempts times, the + remaining map tasks will be killed (unless -i is + set).
  • + +
  • If mapred.speculative.execution is set final and true, the result of the copy is undefined.
  • + +
+ +
+ +
+ +

To use an HSFTP source (i.e. using the hsftp protocol), a Map-Red SSL + configuration file needs to be specified (via the -mapredSslConf + option). This must specify 3 parameters:

+ +
    +
  • ssl.client.truststore.location: The local-filesystem + location of the trust-store file, containing the certificate for + the namenode.
  • + +
  • ssl.client.truststore.type: (Optional) The format of + the trust-store file.
  • + +
  • ssl.client.truststore.password: (Optional) Password + for the trust-store file.
  • + +
+ +

The following is an example of the contents of a Map-Red SSL configuration file:

+ +


<configuration>

+ +


<property>

+

<name>ssl.client.truststore.location</name>

+

<value>/work/keystore.jks</value>

+

<description>Truststore to be used by clients like distcp. Must be specified. </description>

+


</property>

+ +

<property>

+

<name>ssl.client.truststore.password</name>

+

<value>changeme</value>

+

<description>Optional. Default value is "". </description>

+

</property>

+ +


<property>

+

<name>ssl.client.truststore.type</name>

+

<value>jks</value>

+

<description>Optional. Default value is "jks". </description>

+

</property>

+ +


</configuration>

+ +


The SSL configuration file must be in the class-path of the + DistCp program.

+ +
+ + +
diff --git a/hadoop-tools/hadoop-distcp/src/site/xdoc/architecture.xml b/hadoop-tools/hadoop-distcp/src/site/xdoc/architecture.xml new file mode 100644 index 00000000000..18c49259ae7 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/xdoc/architecture.xml @@ -0,0 +1,200 @@ + + + + Architecture of DistCp + + +
+ +

The components of the new DistCp may be classified into the following + categories:

+ +
    + +
  • DistCp Driver
  • +
  • Copy-listing generator
  • +
  • Input-formats and Map-Reduce components
  • + +
+ + +

The DistCp Driver components are responsible for:

+ +
    +
  • Parsing the arguments passed to the DistCp command on the + command-line, via: +
      +
    • OptionsParser, and
    • +
    • DistCpOptionsSwitch
    • +
    +
  • +
  • Assembling the command arguments into an appropriate + DistCpOptions object, and initializing DistCp. These arguments + include: +
      +
    • Source-paths
    • +
    • Target location
    • +
    • Copy options (e.g. whether to update-copy, overwrite, which + file-attributes to preserve, etc.)
    • +
    +
  • +
  • Orchestrating the copy operation by: +
      +
    • Invoking the copy-listing-generator to create the list of + files to be copied.
    • +
    • Setting up and launching the Hadoop Map-Reduce Job to carry + out the copy.
    • +
    • Based on the options, either returning a handle to the + Hadoop MR Job immediately, or waiting till completion.
    • +
    +
  • +
+
+ +

The parser-elements are exercised only from the command-line (or if + DistCp::run() is invoked). The DistCp class may also be used + programmatically, by constructing the DistCpOptions object, and + initializing a DistCp object appropriately.

+ +
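As a rough illustration of such programmatic use, the following sketch constructs DistCpOptions and launches the copy. It mirrors the option setters exercised by the tests in this patch; the class name and paths are hypothetical, and the flag equivalences noted in the comments are assumptions rather than documented guarantees.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

public class DistCpProgrammaticSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Illustrative source and target paths.
    List<Path> sources = new ArrayList<Path>();
    sources.add(new Path("hdfs://nn1:8020/source/first"));
    Path target = new Path("hdfs://nn2:8020/target");

    DistCpOptions options = new DistCpOptions(sources, target);
    options.setSyncFolder(true);        // assumed equivalent of -update
    options.setCopyStrategy("dynamic"); // assumed equivalent of -strategy dynamic

    // Sets up and runs the Map-Reduce copy job.
    Job job = new DistCp(conf, options).execute();
    System.out.println("DistCp succeeded: " + job.isSuccessful());
  }
}

By default execute() waits for the job; the tests in this patch call setBlocking(false) on the options to get the Job handle back immediately instead.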
+ + + +

The copy-listing-generator classes are responsible for creating the list of files/directories to be copied from the source. They examine the contents of the source-paths (files/directories, including wild-cards), and record all paths that need copying into a sequence-file, for consumption by the DistCp Hadoop Job. The main classes in this module include:

+ +
    + +
  1. CopyListing: The interface that should be implemented by any + copy-listing-generator implementation. Also provides the factory + method by which the concrete CopyListing implementation is + chosen.
  2. + +
  3. SimpleCopyListing: An implementation of CopyListing that accepts + multiple source paths (files/directories), and recursively lists + all the individual files and directories under each, for + copy.
  4. + +
  5. GlobbedCopyListing: Another implementation of CopyListing that + expands wild-cards in the source paths.
  6. + +
  7. FileBasedCopyListing: An implementation of CopyListing that + reads the source-path list from a specified file.
  8. + +
+

+ +

Based on whether a source-file-list is specified in the + DistCpOptions, the source-listing is generated in one of the + following ways:

+ +
    + +
  1. If there's no source-file-list, the GlobbedCopyListing is used. + All wild-cards are expanded, and all the expansions are + forwarded to the SimpleCopyListing, which in turn constructs the + listing (via recursive descent of each path).
  2. + +
  3. If a source-file-list is specified, the FileBasedCopyListing is + used. Source-paths are read from the specified file, and then + forwarded to the GlobbedCopyListing. The listing is then + constructed as described above.
  4. + +
+ +
+ +

One may customize the method by which the copy-listing is + constructed by providing a custom implementation of the CopyListing + interface. The behaviour of DistCp differs here from the legacy + DistCp, in how paths are considered for copy.

+ +

The legacy implementation only lists those paths that must + definitely be copied on to target. + E.g. if a file already exists at the target (and -overwrite isn't + specified), the file isn't even considered in the Map-Reduce Copy + Job. Determining this during setup (i.e. before the Map-Reduce Job) + involves file-size and checksum-comparisons that are potentially + time-consuming.

+ +

The new DistCp postpones such checks until the Map-Reduce Job, thus + reducing setup time. Performance is enhanced further since these + checks are parallelized across multiple maps.

+ +
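For illustration, a rough sketch of driving copy-listing generation directly follows. It mirrors the test code included later in this patch (TestCopyListing); the wrapper class name and paths are hypothetical.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.tools.CopyListing;
import org.apache.hadoop.tools.DistCpOptions;

public class CopyListingSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Credentials credentials = new Credentials();

    List<Path> sources = new ArrayList<Path>();
    sources.add(new Path("hdfs://nn1:8020/source/*"));  // wild-cards are expanded
    Path target = new Path("hdfs://nn2:8020/target");
    DistCpOptions options = new DistCpOptions(sources, target);

    // The factory method chooses the concrete CopyListing implementation
    // (GlobbedCopyListing here, since no source-file-list is given).
    CopyListing listing = CopyListing.getCopyListing(conf, credentials, options);

    // Writes <relative-path, FileStatus> records into the listing sequence-file,
    // which the copy job later consumes.
    Path listingFile = new Path("hdfs://nn2:8020/tmp/distcp-listing.seq");
    listing.buildListing(listingFile, options);
  }
}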
+ + + +

The Input-formats and Map-Reduce components are responsible for + the actual copy of files and directories from the source to the + destination path. The listing-file created during copy-listing + generation is consumed at this point, when the copy is carried + out. The classes of interest here include:

+ +
    +
  • UniformSizeInputFormat: This implementation of org.apache.hadoop.mapreduce.InputFormat provides equivalence with Legacy DistCp in balancing load across maps. The aim of the UniformSizeInputFormat is to make each map copy roughly the same number of bytes. Accordingly, the listing file is split into groups of paths, such that the sum of file-sizes in each InputSplit is nearly equal to that of every other split. The splitting isn't always perfect, but its trivial implementation keeps the setup-time low.
  • + +
  • DynamicInputFormat and DynamicRecordReader: +

    The DynamicInputFormat implements org.apache.hadoop.mapreduce.InputFormat, + and is new to DistCp. The listing-file is split into several + "chunk-files", the exact number of chunk-files being a multiple + of the number of maps requested for in the Hadoop Job. Each map + task is "assigned" one of the chunk-files (by renaming the chunk + to the task's id), before the Job is launched.

    + +

    Paths are read from each chunk using the DynamicRecordReader, + and processed in the CopyMapper. After all the paths in a chunk + are processed, the current chunk is deleted and a new chunk is + acquired. The process continues until no more chunks are + available.

    +

    This "dynamic" approach allows faster map-tasks to consume + more paths than slower ones, thus speeding up the DistCp job + overall.

    +
  • + +
  • CopyMapper: This class implements the physical + file-copy. The input-paths are checked against the input-options + (specified in the Job's Configuration), to determine whether a + file needs copy. A file will be copied only if at least one of + the following is true: +
      +
    • A file with the same name doesn't exist at target.
    • +
    • A file with the same name exists at target, but has a + different file size.
    • +
    • A file with the same name exists at target, but has a + different checksum, and -skipcrccheck isn't mentioned.
    • +
    • A file with the same name exists at target, but -overwrite + is specified.
    • +
    • A file with the same name exists at target, but differs in block-size (and block-size needs to be preserved).
    • +
    +
  • + +
  • CopyCommitter: + This class is responsible for the commit-phase of the DistCp + job, including: +
      +
    • Preservation of directory-permissions (if specified in the + options)
    • +
    • Clean-up of temporary-files, work-directories, etc.
    • +
    +
  • +
+
+
+ +
diff --git a/hadoop-tools/hadoop-distcp/src/site/xdoc/cli.xml b/hadoop-tools/hadoop-distcp/src/site/xdoc/cli.xml new file mode 100644 index 00000000000..e4eccd54878 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/xdoc/cli.xml @@ -0,0 +1,123 @@ + + + + Command Line Options + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Flag Description Notes
-p[rbugp]Preserve
+ r: replication number
+ b: block size
+ u: user
+ g: group
+ p: permission
Modification times are not preserved. Also, when + -update is specified, status updates will + not be synchronized unless the file sizes + also differ (i.e. unless the file is re-created). +
-iIgnore failuresAs explained in the Appendix, this option + will keep more accurate statistics about the copy than the + default case. It also preserves logs from failed copies, which + can be valuable for debugging. Finally, a failing map will not + cause the job to fail before all splits are attempted. +
-log <logdir>Write logs to <logdir>DistCp keeps logs of each file it attempts to copy as map + output. If a map fails, the log output will not be retained if + it is re-executed. +
-m <num_maps>Maximum number of simultaneous copiesSpecify the number of maps to copy data. Note that more maps + may not necessarily improve throughput. +
-overwriteOverwrite destinationIf a map fails and -i is not specified, all the + files in the split, not only those that failed, will be recopied. + As discussed in the Usage documentation, it also changes + the semantics for generating destination paths, so users should + use this carefully. +
-updateOverwrite if src size different from dst sizeAs noted in the preceding, this is not a "sync" + operation. The only criterion examined is the source and + destination file sizes; if they differ, the source file + replaces the destination file. As discussed in the + Usage documentation, it also changes the semantics for + generating destination paths, so users should use this carefully. +
-f <urilist_uri>Use list at <urilist_uri> as src listThis is equivalent to listing each source on the command + line. The urilist_uri list should be a fully + qualified URI. +
-filelimit <n>Limit the total number of files to be <= nDeprecated! Ignored in the new DistCp. +
-sizelimit <n>Limit the total size to be <= n bytesDeprecated! Ignored in the new DistCp. +
-deleteDelete the files existing in the dst but not in srcThe deletion is done by FS Shell. So the trash will be used, if it is enabled.
-strategy {dynamic|uniformsize}Choose the copy-strategy to be used in DistCp.By default, uniformsize is used. (i.e. Maps are balanced on the + total size of files copied by each map. Similar to legacy.) + If "dynamic" is specified, DynamicInputFormat is + used instead. (This is described in the Architecture section, + under InputFormats.) +
-bandwidthSpecify bandwidth per map, in MB/second.Each map will be restricted to consume only the specified + bandwidth. This is not always exact. The map throttles back + its bandwidth consumption during a copy, such that the + net bandwidth used tends towards the + specified value. +
-atomic {-tmp <tmp_dir>}Specify atomic commit, with optional tmp directory.-atomic instructs DistCp to copy the source + data to a temporary target location, and then move the + temporary target to the final-location atomically. Data will + either be available at final target in a complete and consistent + form, or not at all. + Optionally, -tmp may be used to specify the + location of the tmp-target. If not specified, a default is + chosen. Note: tmp_dir must be on the final + target cluster. +
-mapredSslConf <ssl_conf_file>Specify SSL Config file, to be used with HSFTP sourceWhen using the hsftp protocol with a source, the security-related properties may be specified in a config-file and passed to DistCp. <ssl_conf_file> needs to be in the classpath.
-asyncRun DistCp asynchronously. Quits as soon as the Hadoop + Job is launched.The Hadoop Job-id is logged, for tracking. +
+
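Bringing several of the flags above together, a couple of illustrative invocations (the paths and values are hypothetical) might look like:

bash$ hadoop distcp -update -delete -strategy dynamic -bandwidth 10 hdfs://nn1:8020/source hdfs://nn2:8020/target

bash$ hadoop distcp -atomic -tmp hdfs://nn2:8020/tmp hdfs://nn1:8020/source hdfs://nn2:8020/target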
+ +
diff --git a/hadoop-tools/hadoop-distcp/src/site/xdoc/index.xml b/hadoop-tools/hadoop-distcp/src/site/xdoc/index.xml new file mode 100644 index 00000000000..27108a7dcef --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/xdoc/index.xml @@ -0,0 +1,32 @@ + + + + DistCp + + +
+

+ DistCp (distributed copy) is a tool used for large inter/intra-cluster + copying. It uses Map/Reduce to effect its distribution, error + handling and recovery, and reporting. It expands a list of files and + directories into input to map tasks, each of which will copy a partition + of the files specified in the source list. +

+

+ The erstwhile implementation of DistCp has its share of quirks and drawbacks, both in its usage and in its extensibility and performance. The purpose of the DistCp refactor was to fix these shortcomings, enabling it to be used and extended programmatically. New paradigms have been introduced to improve runtime and setup performance, while simultaneously retaining the legacy behaviour as the default.

+

+ This document aims to describe the design of the new DistCp, its new features, their optimal use, and any deviations from the legacy implementation.

+
+ +
diff --git a/hadoop-tools/hadoop-distcp/src/site/xdoc/usage.xml b/hadoop-tools/hadoop-distcp/src/site/xdoc/usage.xml new file mode 100644 index 00000000000..a72ec05418e --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/site/xdoc/usage.xml @@ -0,0 +1,147 @@ + + + Usage + + +
+

The most common invocation of DistCp is an inter-cluster copy:

+

bash$ hadoop jar hadoop-distcp.jar hdfs://nn1:8020/foo/bar \
+ hdfs://nn2:8020/bar/foo

+ +

This will expand the namespace under /foo/bar on nn1 + into a temporary file, partition its contents among a set of map + tasks, and start a copy on each TaskTracker from nn1 to nn2.

+ +

One can also specify multiple source directories on the command + line:

+

bash$ hadoop jar hadoop-distcp.jar hdfs://nn1:8020/foo/a \
+ hdfs://nn1:8020/foo/b \
+ hdfs://nn2:8020/bar/foo

+ +

Or, equivalently, from a file using the -f option:
+ bash$ hadoop jar hadoop-distcp.jar -f hdfs://nn1:8020/srclist \
+ hdfs://nn2:8020/bar/foo

+ +

Where srclist contains
+ hdfs://nn1:8020/foo/a
+ hdfs://nn1:8020/foo/b

+ +

When copying from multiple sources, DistCp will abort the copy with + an error message if two sources collide, but collisions at the + destination are resolved per the options + specified. By default, files already existing at the destination are + skipped (i.e. not replaced by the source file). A count of skipped + files is reported at the end of each job, but it may be inaccurate if a + copier failed for some subset of its files, but succeeded on a later + attempt.

+ +

It is important that each TaskTracker can reach and communicate with + both the source and destination file systems. For HDFS, both the source + and destination must be running the same version of the protocol or use + a backwards-compatible protocol (see Copying Between + Versions).

+ +

After a copy, it is recommended that one generate and cross-check a listing of the source and destination to verify that the copy was truly successful. Since DistCp employs both Map/Reduce and the FileSystem API, issues in or between any of the three could adversely and silently affect the copy. Some have had success running with -update enabled to perform a second pass, but users should be acquainted with its semantics before attempting this.

+ +

It's also worth noting that if another client is still writing to a + source file, the copy will likely fail. Attempting to overwrite a file + being written at the destination should also fail on HDFS. If a source + file is (re)moved before it is copied, the copy will fail with a + FileNotFoundException.

+ +

Please refer to the detailed Command Line Reference for information + on all the options available in DistCp.

+ +
+
+ +

-update is used to copy files from the source that don't exist at the target, or that differ from the target version. -overwrite overwrites files at the target wherever they exist, even if their contents match the source.

+ +


Update and Overwrite options warrant special attention, since their + handling of source-paths varies from the defaults in a very subtle manner. + Consider a copy from /source/first/ and + /source/second/ to /target/, where the source + paths have the following contents:

+ +

hdfs://nn1:8020/source/first/1
+ hdfs://nn1:8020/source/first/2
+ hdfs://nn1:8020/source/second/10
+ hdfs://nn1:8020/source/second/20

+ +


When DistCp is invoked without -update or + -overwrite, the DistCp defaults would create directories + first/ and second/, under /target. + Thus:

+ +

distcp hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target

+


would yield the following contents in /target:

+ +

hdfs://nn2:8020/target/first/1
+ hdfs://nn2:8020/target/first/2
+ hdfs://nn2:8020/target/second/10
+ hdfs://nn2:8020/target/second/20

+ +


When either -update or -overwrite is + specified, the contents of the source-directories + are copied to target, and not the source directories themselves. Thus:

+ +

distcp -update hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target

+ +


would yield the following contents in /target:

+ +

hdfs://nn2:8020/target/1
+ hdfs://nn2:8020/target/2
+ hdfs://nn2:8020/target/10
+ hdfs://nn2:8020/target/20

+ +


By extension, if both source folders contained a file with the same name (say, 0), then both sources would map an entry to /target/0 at the destination. Rather than permit this conflict, DistCp will abort.

+ +


Now, consider the following copy operation:

+ +

distcp -update hdfs://nn1:8020/source/first hdfs://nn1:8020/source/second hdfs://nn2:8020/target

+ +


With sources/sizes:

+ +

hdfs://nn1:8020/source/first/1 32
+ hdfs://nn1:8020/source/first/2 32
+ hdfs://nn1:8020/source/second/10 64
+ hdfs://nn1:8020/source/second/20 32

+ +


And destination/sizes:

+ +

hdfs://nn2:8020/target/1 32
+ hdfs://nn2:8020/target/10 32
+ hdfs://nn2:8020/target/20 64

+ +


Will effect:

+ +

hdfs://nn2:8020/target/1 32
+ hdfs://nn2:8020/target/2 32
+ hdfs://nn2:8020/target/10 64
+ hdfs://nn2:8020/target/20 32

+ +


1 is skipped because the file-length and contents match. + 2 is copied because it doesn't exist at the target. + 10 and 20 are overwritten since the contents + don't match the source.

+ +

If -overwrite is used, 1 is overwritten as well.

+ +
+ + +
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/StubContext.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/StubContext.java new file mode 100644 index 00000000000..0223fbc2e1c --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/StubContext.java @@ -0,0 +1,139 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.task.MapContextImpl; +import org.apache.hadoop.mapreduce.lib.map.WrappedMapper; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.conf.Configuration; + +import java.util.List; +import java.util.ArrayList; +import java.io.IOException; + +public class StubContext { + + private StubStatusReporter reporter = new StubStatusReporter(); + private RecordReader reader; + private StubInMemoryWriter writer = new StubInMemoryWriter(); + private Mapper.Context mapperContext; + + public StubContext(Configuration conf, RecordReader reader, + int taskId) throws IOException, InterruptedException { + + WrappedMapper wrappedMapper + = new WrappedMapper(); + + MapContextImpl contextImpl + = new MapContextImpl(conf, + getTaskAttemptID(taskId), reader, writer, + null, reporter, null); + + this.reader = reader; + this.mapperContext = wrappedMapper.getMapContext(contextImpl); + } + + public Mapper.Context getContext() { + return mapperContext; + } + + public StatusReporter getReporter() { + return reporter; + } + + public RecordReader getReader() { + return reader; + } + + public StubInMemoryWriter getWriter() { + return writer; + } + + public static class StubStatusReporter extends StatusReporter { + + private Counters counters = new Counters(); + + public StubStatusReporter() { + /* + final CounterGroup counterGroup + = new CounterGroup("FileInputFormatCounters", + "FileInputFormatCounters"); + counterGroup.addCounter(new Counter("BYTES_READ", + "BYTES_READ", + 0)); + counters.addGroup(counterGroup); + */ + } + + @Override + public Counter getCounter(Enum name) { + return counters.findCounter(name); + } + + @Override + public Counter getCounter(String group, String name) { + return counters.findCounter(group, name); + } + + @Override + public void progress() {} + + @Override + public float getProgress() { + return 0F; + } + + @Override + public void setStatus(String status) {} + } + + + public static class StubInMemoryWriter extends RecordWriter { + + List keys = new ArrayList(); + + List values = new ArrayList(); + + @Override + public void write(Text key, Text value) throws IOException, InterruptedException { + keys.add(key); + 
values.add(value); + } + + @Override + public void close(TaskAttemptContext context) throws IOException, InterruptedException { + } + + public List keys() { + return keys; + } + + public List values() { + return values; + } + + } + + public static TaskAttemptID getTaskAttemptID(int taskId) { + return new TaskAttemptID("", 0, TaskType.MAP, taskId, 0); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java new file mode 100644 index 00000000000..32909818301 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestCopyListing.java @@ -0,0 +1,252 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.tools.util.TestDistCpUtils; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.junit.Test; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.AfterClass; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.List; +import java.util.ArrayList; + +public class TestCopyListing extends SimpleCopyListing { + private static final Log LOG = LogFactory.getLog(TestCopyListing.class); + + private static final Credentials CREDENTIALS = new Credentials(); + + private static final Configuration config = new Configuration(); + private static MiniDFSCluster cluster; + + @BeforeClass + public static void create() throws IOException { + cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true) + .build(); + } + + @AfterClass + public static void destroy() { + if (cluster != null) { + cluster.shutdown(); + } + } + + public TestCopyListing() { + super(config, CREDENTIALS); + } + + protected TestCopyListing(Configuration configuration) { + super(configuration, CREDENTIALS); + } + + @Override + protected long getBytesToCopy() { + return 0; + } + + @Override + protected long getNumberOfPaths() { + return 0; + } + + @Test + public void testMultipleSrcToFile() { + FileSystem fs = null; + try { + fs = FileSystem.get(getConf()); + List srcPaths = new ArrayList(); + srcPaths.add(new Path("/tmp/in/1")); + srcPaths.add(new Path("/tmp/in/2")); + Path target = new Path("/tmp/out/1"); + TestDistCpUtils.createFile(fs, "/tmp/in/1"); + TestDistCpUtils.createFile(fs, 
"/tmp/in/2"); + fs.mkdirs(target); + DistCpOptions options = new DistCpOptions(srcPaths, target); + validatePaths(options); + TestDistCpUtils.delete(fs, "/tmp"); + //No errors + + target = new Path("/tmp/out/1"); + fs.create(target).close(); + options = new DistCpOptions(srcPaths, target); + try { + validatePaths(options); + Assert.fail("Invalid inputs accepted"); + } catch (InvalidInputException ignore) { } + TestDistCpUtils.delete(fs, "/tmp"); + + srcPaths.clear(); + srcPaths.add(new Path("/tmp/in/1")); + fs.mkdirs(new Path("/tmp/in/1")); + target = new Path("/tmp/out/1"); + fs.create(target).close(); + options = new DistCpOptions(srcPaths, target); + try { + validatePaths(options); + Assert.fail("Invalid inputs accepted"); + } catch (InvalidInputException ignore) { } + TestDistCpUtils.delete(fs, "/tmp"); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test input validation failed"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testDuplicates() { + FileSystem fs = null; + try { + fs = FileSystem.get(getConf()); + List srcPaths = new ArrayList(); + srcPaths.add(new Path("/tmp/in/*/*")); + TestDistCpUtils.createFile(fs, "/tmp/in/1.txt"); + TestDistCpUtils.createFile(fs, "/tmp/in/src/1.txt"); + Path target = new Path("/tmp/out"); + Path listingFile = new Path("/tmp/list"); + DistCpOptions options = new DistCpOptions(srcPaths, target); + CopyListing listing = CopyListing.getCopyListing(getConf(), CREDENTIALS, options); + try { + listing.buildListing(listingFile, options); + Assert.fail("Duplicates not detected"); + } catch (DuplicateFileException ignore) { + } + } catch (IOException e) { + LOG.error("Exception encountered in test", e); + Assert.fail("Test failed " + e.getMessage()); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testBuildListing() { + FileSystem fs = null; + try { + fs = FileSystem.get(getConf()); + List srcPaths = new ArrayList(); + Path p1 = new Path("/tmp/in/1"); + Path p2 = new Path("/tmp/in/2"); + Path p3 = new Path("/tmp/in2/2"); + Path target = new Path("/tmp/out/1"); + srcPaths.add(p1.getParent()); + srcPaths.add(p3.getParent()); + TestDistCpUtils.createFile(fs, "/tmp/in/1"); + TestDistCpUtils.createFile(fs, "/tmp/in/2"); + TestDistCpUtils.createFile(fs, "/tmp/in2/2"); + fs.mkdirs(target); + OutputStream out = fs.create(p1); + out.write("ABC".getBytes()); + out.close(); + + out = fs.create(p2); + out.write("DEF".getBytes()); + out.close(); + + out = fs.create(p3); + out.write("GHIJ".getBytes()); + out.close(); + + Path listingFile = new Path("/tmp/file"); + + DistCpOptions options = new DistCpOptions(srcPaths, target); + options.setSyncFolder(true); + CopyListing listing = new SimpleCopyListing(getConf(), CREDENTIALS); + try { + listing.buildListing(listingFile, options); + Assert.fail("Duplicates not detected"); + } catch (DuplicateFileException ignore) { + } + Assert.assertEquals(listing.getBytesToCopy(), 10); + Assert.assertEquals(listing.getNumberOfPaths(), 3); + TestDistCpUtils.delete(fs, "/tmp"); + + try { + listing.buildListing(listingFile, options); + Assert.fail("Invalid input not detected"); + } catch (InvalidInputException ignore) { + } + TestDistCpUtils.delete(fs, "/tmp"); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test build listing failed"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testBuildListingForSingleFile() { + FileSystem fs = null; + String 
testRootString = "/singleFileListing"; + Path testRoot = new Path(testRootString); + SequenceFile.Reader reader = null; + try { + fs = FileSystem.get(getConf()); + if (fs.exists(testRoot)) + TestDistCpUtils.delete(fs, testRootString); + + Path sourceFile = new Path(testRoot, "/source/foo/bar/source.txt"); + Path decoyFile = new Path(testRoot, "/target/moo/source.txt"); + Path targetFile = new Path(testRoot, "/target/moo/target.txt"); + + TestDistCpUtils.createFile(fs, sourceFile.toString()); + TestDistCpUtils.createFile(fs, decoyFile.toString()); + TestDistCpUtils.createFile(fs, targetFile.toString()); + + List srcPaths = new ArrayList(); + srcPaths.add(sourceFile); + + DistCpOptions options = new DistCpOptions(srcPaths, targetFile); + CopyListing listing = new SimpleCopyListing(getConf(), CREDENTIALS); + + final Path listFile = new Path(testRoot, "/tmp/fileList.seq"); + listing.buildListing(listFile, options); + + reader = new SequenceFile.Reader(fs, listFile, getConf()); + FileStatus fileStatus = new FileStatus(); + Text relativePath = new Text(); + Assert.assertTrue(reader.next(relativePath, fileStatus)); + Assert.assertTrue(relativePath.toString().equals("")); + } + catch (Exception e) { + Assert.fail("Unexpected exception encountered."); + LOG.error("Unexpected exception: ", e); + } + finally { + TestDistCpUtils.delete(fs, testRootString); + IOUtils.closeStream(reader); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCp.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCp.java new file mode 100644 index 00000000000..d50f63ab377 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestDistCp.java @@ -0,0 +1,275 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.mapred.MiniMRCluster; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobSubmissionFiles; +import org.apache.hadoop.mapreduce.Cluster; +import org.apache.hadoop.tools.mapred.CopyOutputFormat; +import org.junit.*; + +import java.util.List; +import java.util.ArrayList; +import java.io.*; + +@Ignore +public class TestDistCp { + private static final Log LOG = LogFactory.getLog(TestDistCp.class); + private static List pathList = new ArrayList(); + private static final int FILE_SIZE = 1024; + + private static Configuration configuration; + private static MiniDFSCluster cluster; + private static MiniMRCluster mrCluster; + + private static final String SOURCE_PATH = "/tmp/source"; + private static final String TARGET_PATH = "/tmp/target"; + + @BeforeClass + public static void setup() throws Exception { + configuration = getConfigurationForCluster(); + cluster = new MiniDFSCluster.Builder(configuration).numDataNodes(1) + .format(true).build(); + System.setProperty("org.apache.hadoop.mapred.TaskTracker", "target/tmp"); + configuration.set("org.apache.hadoop.mapred.TaskTracker", "target/tmp"); + System.setProperty("hadoop.log.dir", "target/tmp"); + configuration.set("hadoop.log.dir", "target/tmp"); + mrCluster = new MiniMRCluster(1, cluster.getFileSystem().getUri().toString(), 1); + Configuration mrConf = mrCluster.createJobConf(); + final String mrJobTracker = mrConf.get("mapred.job.tracker"); + configuration.set("mapred.job.tracker", mrJobTracker); + final String mrJobTrackerAddress + = mrConf.get("mapred.job.tracker.http.address"); + configuration.set("mapred.job.tracker.http.address", mrJobTrackerAddress); + } + + @AfterClass + public static void cleanup() { + if (mrCluster != null) mrCluster.shutdown(); + if (cluster != null) cluster.shutdown(); + } + + private static Configuration getConfigurationForCluster() throws IOException { + Configuration configuration = new Configuration(); + System.setProperty("test.build.data", "target/build/TEST_DISTCP/data"); + configuration.set("hadoop.log.dir", "target/tmp"); + + LOG.debug("fs.default.name == " + configuration.get("fs.default.name")); + LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address")); + return configuration; + } + + private static void createSourceData() throws Exception { + mkdirs(SOURCE_PATH + "/1"); + mkdirs(SOURCE_PATH + "/2"); + mkdirs(SOURCE_PATH + "/2/3/4"); + mkdirs(SOURCE_PATH + "/2/3"); + mkdirs(SOURCE_PATH + "/5"); + touchFile(SOURCE_PATH + "/5/6"); + mkdirs(SOURCE_PATH + "/7"); + mkdirs(SOURCE_PATH + "/7/8"); + touchFile(SOURCE_PATH + "/7/8/9"); + } + + private static void mkdirs(String path) throws Exception { + FileSystem fileSystem = cluster.getFileSystem(); + final Path qualifiedPath = new Path(path).makeQualified(fileSystem.getUri(), + fileSystem.getWorkingDirectory()); + pathList.add(qualifiedPath); + fileSystem.mkdirs(qualifiedPath); + } + + private static void touchFile(String path) throws Exception { + FileSystem fs; + DataOutputStream outputStream = null; + try { + fs = cluster.getFileSystem(); + final Path qualifiedPath = new Path(path).makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + final long blockSize 
= fs.getDefaultBlockSize() * 2; + outputStream = fs.create(qualifiedPath, true, 0, + (short)(fs.getDefaultReplication()*2), + blockSize); + outputStream.write(new byte[FILE_SIZE]); + pathList.add(qualifiedPath); + } + finally { + IOUtils.cleanup(null, outputStream); + } + } + + private static void clearState() throws Exception { + pathList.clear(); + cluster.getFileSystem().delete(new Path(TARGET_PATH), true); + createSourceData(); + } + +// @Test + public void testUniformSizeDistCp() throws Exception { + try { + clearState(); + final FileSystem fileSystem = cluster.getFileSystem(); + Path sourcePath = new Path(SOURCE_PATH) + .makeQualified(fileSystem.getUri(), + fileSystem.getWorkingDirectory()); + List sources = new ArrayList(); + sources.add(sourcePath); + + Path targetPath = new Path(TARGET_PATH) + .makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory()); + DistCpOptions options = new DistCpOptions(sources, targetPath); + options.setAtomicCommit(true); + options.setBlocking(false); + Job job = new DistCp(configuration, options).execute(); + Path workDir = CopyOutputFormat.getWorkingDirectory(job); + Path finalDir = CopyOutputFormat.getCommitDirectory(job); + + while (!job.isComplete()) { + if (cluster.getFileSystem().exists(workDir)) { + break; + } + } + job.waitForCompletion(true); + Assert.assertFalse(cluster.getFileSystem().exists(workDir)); + Assert.assertTrue(cluster.getFileSystem().exists(finalDir)); + Assert.assertFalse(cluster.getFileSystem().exists( + new Path(job.getConfiguration().get(DistCpConstants.CONF_LABEL_META_FOLDER)))); + verifyResults(); + } + catch (Exception e) { + LOG.error("Exception encountered", e); + Assert.fail("Unexpected exception: " + e.getMessage()); + } + } + +// @Test + public void testCleanup() { + try { + clearState(); + Path sourcePath = new Path("noscheme:///file"); + List sources = new ArrayList(); + sources.add(sourcePath); + + final FileSystem fs = cluster.getFileSystem(); + Path targetPath = new Path(TARGET_PATH) + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + DistCpOptions options = new DistCpOptions(sources, targetPath); + + Path stagingDir = JobSubmissionFiles.getStagingDir( + new Cluster(configuration), configuration); + stagingDir.getFileSystem(configuration).mkdirs(stagingDir); + + try { + new DistCp(configuration, options).execute(); + } catch (Throwable t) { + Assert.assertEquals(stagingDir.getFileSystem(configuration). 
+ listStatus(stagingDir).length, 0); + } + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("testCleanup failed " + e.getMessage()); + } + } + + @Test + public void testRootPath() throws Exception { + try { + clearState(); + List sources = new ArrayList(); + final FileSystem fs = cluster.getFileSystem(); + sources.add(new Path("/a") + .makeQualified(fs.getUri(), fs.getWorkingDirectory())); + sources.add(new Path("/b") + .makeQualified(fs.getUri(), fs.getWorkingDirectory())); + touchFile("/a/a.txt"); + touchFile("/b/b.txt"); + + Path targetPath = new Path("/c") + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + DistCpOptions options = new DistCpOptions(sources, targetPath); + new DistCp(configuration, options).execute(); + Assert.assertTrue(fs.exists(new Path("/c/a/a.txt"))); + Assert.assertTrue(fs.exists(new Path("/c/b/b.txt"))); + } + catch (Exception e) { + LOG.error("Exception encountered", e); + Assert.fail("Unexpected exception: " + e.getMessage()); + } + } + + @Test + public void testDynamicDistCp() throws Exception { + try { + clearState(); + final FileSystem fs = cluster.getFileSystem(); + Path sourcePath = new Path(SOURCE_PATH) + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + List sources = new ArrayList(); + sources.add(sourcePath); + + Path targetPath = new Path(TARGET_PATH) + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + DistCpOptions options = new DistCpOptions(sources, targetPath); + options.setCopyStrategy("dynamic"); + + options.setAtomicCommit(true); + options.setAtomicWorkPath(new Path("/work")); + options.setBlocking(false); + Job job = new DistCp(configuration, options).execute(); + Path workDir = CopyOutputFormat.getWorkingDirectory(job); + Path finalDir = CopyOutputFormat.getCommitDirectory(job); + + while (!job.isComplete()) { + if (fs.exists(workDir)) { + break; + } + } + job.waitForCompletion(true); + Assert.assertFalse(fs.exists(workDir)); + Assert.assertTrue(fs.exists(finalDir)); + + verifyResults(); + } + catch (Exception e) { + LOG.error("Exception encountered", e); + Assert.fail("Unexpected exception: " + e.getMessage()); + } + } + + private static void verifyResults() throws Exception { + for (Path path : pathList) { + FileSystem fs = cluster.getFileSystem(); + + Path sourcePath = path.makeQualified(fs.getUri(), fs.getWorkingDirectory()); + Path targetPath + = new Path(sourcePath.toString().replaceAll(SOURCE_PATH, TARGET_PATH)); + + Assert.assertTrue(fs.exists(targetPath)); + Assert.assertEquals(fs.isFile(sourcePath), fs.isFile(targetPath)); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestFileBasedCopyListing.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestFileBasedCopyListing.java new file mode 100644 index 00000000000..610ff8c6ae8 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestFileBasedCopyListing.java @@ -0,0 +1,542 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.tools.util.TestDistCpUtils; +import org.apache.hadoop.security.Credentials; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; + +public class TestFileBasedCopyListing { + private static final Log LOG = LogFactory.getLog(TestFileBasedCopyListing.class); + + private static final Credentials CREDENTIALS = new Credentials(); + + private static final Configuration config = new Configuration(); + private static MiniDFSCluster cluster; + private static FileSystem fs; + + @BeforeClass + public static void create() throws IOException { + cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true) + .build(); + fs = cluster.getFileSystem(); + buildExpectedValuesMap(); + } + + @AfterClass + public static void destroy() { + if (cluster != null) { + cluster.shutdown(); + } + } + + private static Map map = new HashMap(); + + private static void buildExpectedValuesMap() { + map.put("/file1", "/tmp/singlefile1/file1"); + map.put("/file2", "/tmp/singlefile2/file2"); + map.put("/file3", "/tmp/multifile/file3"); + map.put("/file4", "/tmp/multifile/file4"); + map.put("/file5", "/tmp/multifile/file5"); + map.put("/multifile/file3", "/tmp/multifile/file3"); + map.put("/multifile/file4", "/tmp/multifile/file4"); + map.put("/multifile/file5", "/tmp/multifile/file5"); + map.put("/Ufile3", "/tmp/Umultifile/Ufile3"); + map.put("/Ufile4", "/tmp/Umultifile/Ufile4"); + map.put("/Ufile5", "/tmp/Umultifile/Ufile5"); + map.put("/dir1", "/tmp/singledir/dir1"); + map.put("/singledir/dir1", "/tmp/singledir/dir1"); + map.put("/dir2", "/tmp/singledir/dir2"); + map.put("/singledir/dir2", "/tmp/singledir/dir2"); + map.put("/Udir1", "/tmp/Usingledir/Udir1"); + map.put("/Udir2", "/tmp/Usingledir/Udir2"); + map.put("/dir2/file6", "/tmp/singledir/dir2/file6"); + map.put("/singledir/dir2/file6", "/tmp/singledir/dir2/file6"); + map.put("/file7", "/tmp/singledir1/dir3/file7"); + map.put("/file8", "/tmp/singledir1/dir3/file8"); + map.put("/file9", "/tmp/singledir1/dir3/file9"); + map.put("/dir3/file7", "/tmp/singledir1/dir3/file7"); + map.put("/dir3/file8", "/tmp/singledir1/dir3/file8"); + map.put("/dir3/file9", "/tmp/singledir1/dir3/file9"); + map.put("/Ufile7", "/tmp/Usingledir1/Udir3/Ufile7"); + map.put("/Ufile8", "/tmp/Usingledir1/Udir3/Ufile8"); + map.put("/Ufile9", "/tmp/Usingledir1/Udir3/Ufile9"); + } + + @Test + public void testSingleFileMissingTarget() { + caseSingleFileMissingTarget(false); + caseSingleFileMissingTarget(true); + } + + private void 
caseSingleFileMissingTarget(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/singlefile1/file1"); + createFiles("/tmp/singlefile1/file1"); + + runTest(listFile, target, sync); + + checkResult(listFile, 0); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testSingleFileTargetFile() { + caseSingleFileTargetFile(false); + caseSingleFileTargetFile(true); + } + + private void caseSingleFileTargetFile(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/singlefile1/file1"); + createFiles("/tmp/singlefile1/file1", target.toString()); + + runTest(listFile, target, sync); + + checkResult(listFile, 0); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testSingleFileTargetDir() { + caseSingleFileTargetDir(false); + caseSingleFileTargetDir(true); + } + + private void caseSingleFileTargetDir(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/singlefile2/file2"); + createFiles("/tmp/singlefile2/file2"); + mkdirs(target.toString()); + + runTest(listFile, target, sync); + + checkResult(listFile, 1); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testSingleDirTargetMissing() { + caseSingleDirTargetMissing(false); + caseSingleDirTargetMissing(true); + } + + private void caseSingleDirTargetMissing(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/singledir"); + mkdirs("/tmp/singledir/dir1"); + + runTest(listFile, target, sync); + + checkResult(listFile, 1); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testSingleDirTargetPresent() { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/singledir"); + mkdirs("/tmp/singledir/dir1"); + mkdirs(target.toString()); + + runTest(listFile, target); + + checkResult(listFile, 1); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testUpdateSingleDirTargetPresent() { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/Usingledir"); + mkdirs("/tmp/Usingledir/Udir1"); + mkdirs(target.toString()); + + runTest(listFile, target, true); + + checkResult(listFile, 1); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testMultiFileTargetPresent() { + 
caseMultiFileTargetPresent(false); + caseMultiFileTargetPresent(true); + } + + private void caseMultiFileTargetPresent(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + mkdirs(target.toString()); + + runTest(listFile, target, sync); + + checkResult(listFile, 3); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testMultiFileTargetMissing() { + caseMultiFileTargetMissing(false); + caseMultiFileTargetMissing(true); + } + + private void caseMultiFileTargetMissing(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + + runTest(listFile, target, sync); + + checkResult(listFile, 3); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testMultiDirTargetPresent() { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/multifile", "/tmp/singledir"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + mkdirs(target.toString(), "/tmp/singledir/dir1"); + + runTest(listFile, target); + + checkResult(listFile, 4); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testUpdateMultiDirTargetPresent() { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/Umultifile", "/tmp/Usingledir"); + createFiles("/tmp/Umultifile/Ufile3", "/tmp/Umultifile/Ufile4", "/tmp/Umultifile/Ufile5"); + mkdirs(target.toString(), "/tmp/Usingledir/Udir1"); + + runTest(listFile, target, true); + + checkResult(listFile, 4); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testMultiDirTargetMissing() { + caseMultiDirTargetMissing(false); + caseMultiDirTargetMissing(true); + } + + private void caseMultiDirTargetMissing(boolean sync) { + + try { + Path listFile = new Path("/tmp/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/multifile", "/tmp/singledir"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + mkdirs("/tmp/singledir/dir1"); + + runTest(listFile, target, sync); + + checkResult(listFile, 4); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + } + } + + @Test + public void testGlobTargetMissingSingleLevel() { + caseGlobTargetMissingSingleLevel(false); + 
caseGlobTargetMissingSingleLevel(true); + } + + private void caseGlobTargetMissingSingleLevel(boolean sync) { + + try { + Path listFile = new Path("/tmp1/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/*"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + createFiles("/tmp/singledir/dir2/file6"); + + runTest(listFile, target, sync); + + checkResult(listFile, 5); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + TestDistCpUtils.delete(fs, "/tmp1"); + } + } + + @Test + public void testGlobTargetMissingMultiLevel() { + caseGlobTargetMissingMultiLevel(false); + caseGlobTargetMissingMultiLevel(true); + } + + private void caseGlobTargetMissingMultiLevel(boolean sync) { + + try { + Path listFile = new Path("/tmp1/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/*/*"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + createFiles("/tmp/singledir1/dir3/file7", "/tmp/singledir1/dir3/file8", + "/tmp/singledir1/dir3/file9"); + + runTest(listFile, target, sync); + + checkResult(listFile, 6); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + TestDistCpUtils.delete(fs, "/tmp1"); + } + } + + @Test + public void testGlobTargetDirMultiLevel() { + + try { + Path listFile = new Path("/tmp1/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/*/*"); + createFiles("/tmp/multifile/file3", "/tmp/multifile/file4", "/tmp/multifile/file5"); + createFiles("/tmp/singledir1/dir3/file7", "/tmp/singledir1/dir3/file8", + "/tmp/singledir1/dir3/file9"); + mkdirs(target.toString()); + + runTest(listFile, target); + + checkResult(listFile, 6); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + TestDistCpUtils.delete(fs, "/tmp1"); + } + } + + @Test + public void testUpdateGlobTargetDirMultiLevel() { + + try { + Path listFile = new Path("/tmp1/listing"); + Path target = new Path("/tmp/target"); + + addEntries(listFile, "/tmp/*/*"); + createFiles("/tmp/Umultifile/Ufile3", "/tmp/Umultifile/Ufile4", "/tmp/Umultifile/Ufile5"); + createFiles("/tmp/Usingledir1/Udir3/Ufile7", "/tmp/Usingledir1/Udir3/Ufile8", + "/tmp/Usingledir1/Udir3/Ufile9"); + mkdirs(target.toString()); + + runTest(listFile, target, true); + + checkResult(listFile, 6); + } catch (IOException e) { + LOG.error("Exception encountered while testing build listing", e); + Assert.fail("build listing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp"); + TestDistCpUtils.delete(fs, "/tmp1"); + } + } + + private void addEntries(Path listFile, String... entries) throws IOException { + OutputStream out = fs.create(listFile); + try { + for (String entry : entries){ + out.write(entry.getBytes()); + out.write("\n".getBytes()); + } + } finally { + out.close(); + } + } + + private void createFiles(String... entries) throws IOException { + for (String entry : entries){ + OutputStream out = fs.create(new Path(entry)); + try { + out.write(entry.getBytes()); + out.write("\n".getBytes()); + } finally { + out.close(); + } + } + } + + private void mkdirs(String... 
entries) throws IOException { + for (String entry : entries){ + fs.mkdirs(new Path(entry)); + } + } + + private void runTest(Path listFile, Path target) throws IOException { + runTest(listFile, target, true); + } + + private void runTest(Path listFile, Path target, boolean sync) throws IOException { + CopyListing listing = new FileBasedCopyListing(config, CREDENTIALS); + DistCpOptions options = new DistCpOptions(listFile, target); + options.setSyncFolder(sync); + listing.buildListing(listFile, options); + } + + private void checkResult(Path listFile, int count) throws IOException { + if (count == 0) { + return; + } + + int recCount = 0; + SequenceFile.Reader reader = new SequenceFile.Reader(config, + SequenceFile.Reader.file(listFile)); + try { + Text relPath = new Text(); + FileStatus fileStatus = new FileStatus(); + while (reader.next(relPath, fileStatus)) { + Assert.assertEquals(fileStatus.getPath().toUri().getPath(), map.get(relPath.toString())); + recCount++; + } + } finally { + IOUtils.closeStream(reader); + } + Assert.assertEquals(recCount, count); + } + +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestGlobbedCopyListing.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestGlobbedCopyListing.java new file mode 100644 index 00000000000..572a866fe6c --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestGlobbedCopyListing.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.tools; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.apache.hadoop.security.Credentials; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.DataOutputStream; +import java.net.URI; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +public class TestGlobbedCopyListing { + + private static MiniDFSCluster cluster; + + private static final Credentials CREDENTIALS = new Credentials(); + + public static Map expectedValues = new HashMap(); + + @BeforeClass + public static void setup() throws Exception { + cluster = new MiniDFSCluster(new Configuration(), 1, true, null); + createSourceData(); + } + + private static void createSourceData() throws Exception { + mkdirs("/tmp/source/1"); + mkdirs("/tmp/source/2"); + mkdirs("/tmp/source/2/3"); + mkdirs("/tmp/source/2/3/4"); + mkdirs("/tmp/source/5"); + touchFile("/tmp/source/5/6"); + mkdirs("/tmp/source/7"); + mkdirs("/tmp/source/7/8"); + touchFile("/tmp/source/7/8/9"); + } + + private static void mkdirs(String path) throws Exception { + FileSystem fileSystem = null; + try { + fileSystem = cluster.getFileSystem(); + fileSystem.mkdirs(new Path(path)); + recordInExpectedValues(path); + } + finally { + IOUtils.cleanup(null, fileSystem); + } + } + + private static void touchFile(String path) throws Exception { + FileSystem fileSystem = null; + DataOutputStream outputStream = null; + try { + fileSystem = cluster.getFileSystem(); + outputStream = fileSystem.create(new Path(path), true, 0); + recordInExpectedValues(path); + } + finally { + IOUtils.cleanup(null, fileSystem, outputStream); + } + } + + private static void recordInExpectedValues(String path) throws Exception { + FileSystem fileSystem = cluster.getFileSystem(); + Path sourcePath = new Path(fileSystem.getUri().toString() + path); + expectedValues.put(sourcePath.toString(), DistCpUtils.getRelativePath( + new Path("/tmp/source"), sourcePath)); + } + + @AfterClass + public static void tearDown() { + cluster.shutdown(); + } + + @Test + public void testRun() throws Exception { + final URI uri = cluster.getFileSystem().getUri(); + final String pathString = uri.toString(); + Path fileSystemPath = new Path(pathString); + Path source = new Path(fileSystemPath.toString() + "/tmp/source"); + Path target = new Path(fileSystemPath.toString() + "/tmp/target"); + Path listingPath = new Path(fileSystemPath.toString() + "/tmp/META/fileList.seq"); + DistCpOptions options = new DistCpOptions(Arrays.asList(source), target); + + new GlobbedCopyListing(new Configuration(), CREDENTIALS).buildListing(listingPath, options); + + verifyContents(listingPath); + } + + private void verifyContents(Path listingPath) throws Exception { + SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem(), + listingPath, new Configuration()); + Text key = new Text(); + FileStatus value = new FileStatus(); + Map actualValues = new HashMap(); + while (reader.next(key, value)) { + actualValues.put(value.getPath().toString(), key.toString()); + } + + Assert.assertEquals(expectedValues.size(), actualValues.size()); + for (Map.Entry entry : 
actualValues.entrySet()) { + Assert.assertEquals(entry.getValue(), expectedValues.get(entry.getKey())); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestIntegration.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestIntegration.java new file mode 100644 index 00000000000..303a4e05fc5 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestIntegration.java @@ -0,0 +1,466 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.tools.util.TestDistCpUtils; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.IOException; +import java.io.OutputStream; + +public class TestIntegration { + private static final Log LOG = LogFactory.getLog(TestIntegration.class); + + private static FileSystem fs; + + private static Path listFile; + private static Path target; + private static String root; + + private static Configuration getConf() { + Configuration conf = new Configuration(); + conf.set("fs.default.name", "file:///"); + conf.set("mapred.job.tracker", "local"); + return conf; + } + + @BeforeClass + public static void setup() { + try { + fs = FileSystem.get(getConf()); + listFile = new Path("target/tmp/listing").makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + target = new Path("target/tmp/target").makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + root = new Path("target/tmp").makeQualified(fs.getUri(), + fs.getWorkingDirectory()).toString(); + TestDistCpUtils.delete(fs, root); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + } + } + + @Test + public void testSingleFileMissingTarget() { + caseSingleFileMissingTarget(false); + caseSingleFileMissingTarget(true); + } + + private void caseSingleFileMissingTarget(boolean sync) { + + try { + addEntries(listFile, "singlefile1/file1"); + createFiles("singlefile1/file1"); + + runTest(listFile, target, sync); + + checkResult(target, 1); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testSingleFileTargetFile() { + caseSingleFileTargetFile(false); + caseSingleFileTargetFile(true); + } + + private void caseSingleFileTargetFile(boolean sync) { + + try { + addEntries(listFile, "singlefile1/file1"); + createFiles("singlefile1/file1", target.toString()); + + runTest(listFile, target, sync); + + 
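// The target should resolve to exactly one file after the copy. +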
checkResult(target, 1); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testSingleFileTargetDir() { + caseSingleFileTargetDir(false); + caseSingleFileTargetDir(true); + } + + private void caseSingleFileTargetDir(boolean sync) { + + try { + addEntries(listFile, "singlefile2/file2"); + createFiles("singlefile2/file2"); + mkdirs(target.toString()); + + runTest(listFile, target, sync); + + checkResult(target, 1, "file2"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testSingleDirTargetMissing() { + caseSingleDirTargetMissing(false); + caseSingleDirTargetMissing(true); + } + + private void caseSingleDirTargetMissing(boolean sync) { + + try { + addEntries(listFile, "singledir"); + mkdirs(root + "/singledir/dir1"); + + runTest(listFile, target, sync); + + checkResult(target, 1, "dir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testSingleDirTargetPresent() { + + try { + addEntries(listFile, "singledir"); + mkdirs(root + "/singledir/dir1"); + mkdirs(target.toString()); + + runTest(listFile, target, false); + + checkResult(target, 1, "singledir/dir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testUpdateSingleDirTargetPresent() { + + try { + addEntries(listFile, "Usingledir"); + mkdirs(root + "/Usingledir/Udir1"); + mkdirs(target.toString()); + + runTest(listFile, target, true); + + checkResult(target, 1, "Udir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testMultiFileTargetPresent() { + caseMultiFileTargetPresent(false); + caseMultiFileTargetPresent(true); + } + + private void caseMultiFileTargetPresent(boolean sync) { + + try { + addEntries(listFile, "multifile/file3", "multifile/file4", "multifile/file5"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + mkdirs(target.toString()); + + runTest(listFile, target, sync); + + checkResult(target, 3, "file3", "file4", "file5"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testMultiFileTargetMissing() { + caseMultiFileTargetMissing(false); + caseMultiFileTargetMissing(true); + } + + private void caseMultiFileTargetMissing(boolean sync) { + + try { + addEntries(listFile, "multifile/file3", "multifile/file4", "multifile/file5"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + + runTest(listFile, target, sync); + + checkResult(target, 3, "file3", "file4", "file5"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testMultiDirTargetPresent() { + + try { + addEntries(listFile, "multifile", "singledir"); + 
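// The listing names two source directories; populate them on disk before running DistCp. +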
createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + mkdirs(target.toString(), root + "/singledir/dir1"); + + runTest(listFile, target, false); + + checkResult(target, 2, "multifile/file3", "multifile/file4", "multifile/file5", "singledir/dir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testUpdateMultiDirTargetPresent() { + + try { + addEntries(listFile, "Umultifile", "Usingledir"); + createFiles("Umultifile/Ufile3", "Umultifile/Ufile4", "Umultifile/Ufile5"); + mkdirs(target.toString(), root + "/Usingledir/Udir1"); + + runTest(listFile, target, true); + + checkResult(target, 4, "Ufile3", "Ufile4", "Ufile5", "Udir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testMultiDirTargetMissing() { + + try { + addEntries(listFile, "multifile", "singledir"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + mkdirs(root + "/singledir/dir1"); + + runTest(listFile, target, false); + + checkResult(target, 2, "multifile/file3", "multifile/file4", + "multifile/file5", "singledir/dir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testUpdateMultiDirTargetMissing() { + + try { + addEntries(listFile, "multifile", "singledir"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + mkdirs(root + "/singledir/dir1"); + + runTest(listFile, target, true); + + checkResult(target, 4, "file3", "file4", "file5", "dir1"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + } + } + + @Test + public void testGlobTargetMissingSingleLevel() { + + try { + Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + addEntries(listFile, "*"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + createFiles("singledir/dir2/file6"); + + runTest(listFile, target, false); + + checkResult(target, 2, "multifile/file3", "multifile/file4", "multifile/file5", + "singledir/dir2/file6"); + } catch (IOException e) { + LOG.error("Exception encountered while testing distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + TestDistCpUtils.delete(fs, "target/tmp1"); + } + } + + @Test + public void testUpdateGlobTargetMissingSingleLevel() { + + try { + Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + addEntries(listFile, "*"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + createFiles("singledir/dir2/file6"); + + runTest(listFile, target, true); + + checkResult(target, 4, "file3", "file4", "file5", "dir2/file6"); + } catch (IOException e) { + LOG.error("Exception encountered while running distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + TestDistCpUtils.delete(fs, "target/tmp1"); + } + } + + @Test + public void testGlobTargetMissingMultiLevel() { + + try { + Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(), + 
fs.getWorkingDirectory()); + addEntries(listFile, "*/*"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + createFiles("singledir1/dir3/file7", "singledir1/dir3/file8", + "singledir1/dir3/file9"); + + runTest(listFile, target, false); + + checkResult(target, 4, "file3", "file4", "file5", + "dir3/file7", "dir3/file8", "dir3/file9"); + } catch (IOException e) { + LOG.error("Exception encountered while running distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + TestDistCpUtils.delete(fs, "target/tmp1"); + } + } + + @Test + public void testUpdateGlobTargetMissingMultiLevel() { + + try { + Path listFile = new Path("target/tmp1/listing").makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + addEntries(listFile, "*/*"); + createFiles("multifile/file3", "multifile/file4", "multifile/file5"); + createFiles("singledir1/dir3/file7", "singledir1/dir3/file8", + "singledir1/dir3/file9"); + + runTest(listFile, target, true); + + checkResult(target, 6, "file3", "file4", "file5", + "file7", "file8", "file9"); + } catch (IOException e) { + LOG.error("Exception encountered while running distcp", e); + Assert.fail("distcp failure"); + } finally { + TestDistCpUtils.delete(fs, root); + TestDistCpUtils.delete(fs, "target/tmp1"); + } + } + + private void addEntries(Path listFile, String... entries) throws IOException { + OutputStream out = fs.create(listFile); + try { + for (String entry : entries){ + out.write((root + "/" + entry).getBytes()); + out.write("\n".getBytes()); + } + } finally { + out.close(); + } + } + + private void createFiles(String... entries) throws IOException { + for (String entry : entries){ + OutputStream out = fs.create(new Path(root + "/" + entry)); + try { + out.write((root + "/" + entry).getBytes()); + out.write("\n".getBytes()); + } finally { + out.close(); + } + } + } + + private void mkdirs(String... entries) throws IOException { + for (String entry : entries){ + fs.mkdirs(new Path(entry)); + } + } + + private void runTest(Path listFile, Path target, boolean sync) throws IOException { + DistCpOptions options = new DistCpOptions(listFile, target); + options.setSyncFolder(sync); + try { + new DistCp(getConf(), options).execute(); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + throw new IOException(e); + } + } + + private void checkResult(Path target, int count, String... relPaths) throws IOException { + Assert.assertEquals(count, fs.listStatus(target).length); + if (relPaths == null || relPaths.length == 0) { + Assert.assertTrue(target.toString(), fs.exists(target)); + return; + } + for (String relPath : relPaths) { + Assert.assertTrue(new Path(target, relPath).toString(), fs.exists(new Path(target, relPath))); + } + } + +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java new file mode 100644 index 00000000000..14c409e8b26 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/TestOptionsParser.java @@ -0,0 +1,497 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools; + +import org.junit.Assert; +import org.junit.Test; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.tools.DistCpOptions.*; +import org.apache.hadoop.conf.Configuration; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +public class TestOptionsParser { + + @Test + public void testParseIgnoreFailure() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldIgnoreFailures()); + + options = OptionsParser.parse(new String[] { + "-i", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldIgnoreFailures()); + } + + @Test + public void testParseOverwrite() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldOverwrite()); + + options = OptionsParser.parse(new String[] { + "-overwrite", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldOverwrite()); + + try { + OptionsParser.parse(new String[] { + "-update", + "-overwrite", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("Update and overwrite aren't allowed together"); + } catch (IllegalArgumentException ignore) { + } + } + + @Test + public void testLogPath() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertNull(options.getLogPath()); + + options = OptionsParser.parse(new String[] { + "-log", + "hdfs://localhost:8020/logs", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getLogPath(), new Path("hdfs://localhost:8020/logs")); + } + + @Test + public void testParseBlokcing() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldBlock()); + + options = OptionsParser.parse(new String[] { + "-async", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldBlock()); + } + + @Test + public void testParsebandwidth() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getMapBandwidth(), DistCpConstants.DEFAULT_BANDWIDTH_MB); + + options = OptionsParser.parse(new String[] { + "-bandwidth", + "11", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getMapBandwidth(), 11); + } + + @Test + public void testParseSkipCRC() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldSkipCRC()); + + options = OptionsParser.parse(new String[] { + "-update", + 
"-skipcrccheck", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldSyncFolder()); + Assert.assertTrue(options.shouldSkipCRC()); + } + + @Test + public void testParseAtomicCommit() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldAtomicCommit()); + + options = OptionsParser.parse(new String[] { + "-atomic", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldAtomicCommit()); + + try { + OptionsParser.parse(new String[] { + "-atomic", + "-update", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("Atomic and sync folders were allowed"); + } catch (IllegalArgumentException ignore) { } + } + + @Test + public void testParseWorkPath() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertNull(options.getAtomicWorkPath()); + + options = OptionsParser.parse(new String[] { + "-atomic", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertNull(options.getAtomicWorkPath()); + + options = OptionsParser.parse(new String[] { + "-atomic", + "-tmp", + "hdfs://localhost:8020/work", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getAtomicWorkPath(), new Path("hdfs://localhost:8020/work")); + + try { + OptionsParser.parse(new String[] { + "-tmp", + "hdfs://localhost:8020/work", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("work path was allowed without -atomic switch"); + } catch (IllegalArgumentException ignore) {} + } + + @Test + public void testParseSyncFolders() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldSyncFolder()); + + options = OptionsParser.parse(new String[] { + "-update", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldSyncFolder()); + } + + @Test + public void testParseDeleteMissing() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldDeleteMissing()); + + options = OptionsParser.parse(new String[] { + "-update", + "-delete", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldSyncFolder()); + Assert.assertTrue(options.shouldDeleteMissing()); + + options = OptionsParser.parse(new String[] { + "-overwrite", + "-delete", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldOverwrite()); + Assert.assertTrue(options.shouldDeleteMissing()); + + try { + OptionsParser.parse(new String[] { + "-atomic", + "-delete", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("Atomic and delete folders were allowed"); + } catch (IllegalArgumentException ignore) { } + } + + @Test + public void testParseSSLConf() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertNull(options.getSslConfigurationFile()); + + options = 
OptionsParser.parse(new String[] { + "-mapredSslConf", + "/tmp/ssl-client.xml", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getSslConfigurationFile(), "/tmp/ssl-client.xml"); + } + + @Test + public void testParseMaps() { + DistCpOptions options = OptionsParser.parse(new String[] { + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getMaxMaps(), DistCpConstants.DEFAULT_MAPS); + + options = OptionsParser.parse(new String[] { + "-m", + "1", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getMaxMaps(), 1); + + try { + OptionsParser.parse(new String[] { + "-m", + "hello", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("Non numberic map parsed"); + } catch (IllegalArgumentException ignore) { } + + try { + OptionsParser.parse(new String[] { + "-mapredXslConf", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("Non numberic map parsed"); + } catch (IllegalArgumentException ignore) { } + } + + @Test + public void testSourceListing() { + DistCpOptions options = OptionsParser.parse(new String[] { + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getSourceFileListing(), + new Path("hdfs://localhost:8020/source/first")); + } + + @Test + public void testSourceListingAndSourcePath() { + try { + OptionsParser.parse(new String[] { + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.fail("Both source listing & source paths allowed"); + } catch (IllegalArgumentException ignore) {} + } + + @Test + public void testMissingSourceInfo() { + try { + OptionsParser.parse(new String[] { + "hdfs://localhost:8020/target/"}); + Assert.fail("Neither source listing not source paths present"); + } catch (IllegalArgumentException ignore) {} + } + + @Test + public void testMissingTarget() { + try { + OptionsParser.parse(new String[] { + "-f", "hdfs://localhost:8020/source"}); + Assert.fail("Missing target allowed"); + } catch (IllegalArgumentException ignore) {} + } + + @Test + public void testInvalidArgs() { + try { + OptionsParser.parse(new String[] { + "-m", "-f", "hdfs://localhost:8020/source"}); + Assert.fail("Missing map value"); + } catch (IllegalArgumentException ignore) {} + } + + @Test + public void testToString() { + DistCpOptions option = new DistCpOptions(new Path("abc"), new Path("xyz")); + String val = "DistCpOptions{atomicCommit=false, syncFolder=false, deleteMissing=false, " + + "ignoreFailures=false, maxMaps=20, sslConfigurationFile='null', copyStrategy='uniformsize', " + + "sourceFileListing=abc, sourcePaths=null, targetPath=xyz}"; + Assert.assertEquals(val, option.toString()); + Assert.assertNotSame(DistCpOptionSwitch.ATOMIC_COMMIT.toString(), + DistCpOptionSwitch.ATOMIC_COMMIT.name()); + } + + @Test + public void testCopyStrategy() { + DistCpOptions options = OptionsParser.parse(new String[] { + "-strategy", + "dynamic", + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getCopyStrategy(), "dynamic"); + + options = OptionsParser.parse(new String[] { + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getCopyStrategy(), DistCpConstants.UNIFORMSIZE); + } + + @Test + public void 
testTargetPath() { + DistCpOptions options = OptionsParser.parse(new String[] { + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertEquals(options.getTargetPath(), new Path("hdfs://localhost:8020/target/")); + } + + @Test + public void testPreserve() { + DistCpOptions options = OptionsParser.parse(new String[] { + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldPreserve(FileAttribute.BLOCKSIZE)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.REPLICATION)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.PERMISSION)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.USER)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.GROUP)); + + options = OptionsParser.parse(new String[] { + "-p", + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.USER)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.GROUP)); + + options = OptionsParser.parse(new String[] { + "-p", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.USER)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.GROUP)); + + options = OptionsParser.parse(new String[] { + "-pbr", + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.PERMISSION)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.USER)); + Assert.assertFalse(options.shouldPreserve(FileAttribute.GROUP)); + + options = OptionsParser.parse(new String[] { + "-pbrgup", + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertTrue(options.shouldPreserve(FileAttribute.BLOCKSIZE)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.REPLICATION)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.USER)); + Assert.assertTrue(options.shouldPreserve(FileAttribute.GROUP)); + + options = OptionsParser.parse(new String[] { + "-p", + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + int i = 0; + Iterator attribIterator = options.preserveAttributes(); + while (attribIterator.hasNext()) { + attribIterator.next(); + i++; + } + Assert.assertEquals(i, 5); + + try { + OptionsParser.parse(new String[] { + "-pabc", + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target"}); + Assert.fail("Invalid preserve attribute"); + } + catch (IllegalArgumentException ignore) {} + catch (NoSuchElementException ignore) {} + + options = OptionsParser.parse(new String[] { + "-f", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + Assert.assertFalse(options.shouldPreserve(FileAttribute.PERMISSION)); + 
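// Enabling PERMISSION preservation explicitly should stick, and repeating the call should be a no-op. +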
options.preserve(FileAttribute.PERMISSION); + Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION)); + + options.preserve(FileAttribute.PERMISSION); + Assert.assertTrue(options.shouldPreserve(FileAttribute.PERMISSION)); + } + + @Test + public void testOptionsSwitchAddToConf() { + Configuration conf = new Configuration(); + Assert.assertNull(conf.get(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel())); + DistCpOptionSwitch.addToConf(conf, DistCpOptionSwitch.ATOMIC_COMMIT); + Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false)); + } + + @Test + public void testOptionsAppendToConf() { + Configuration conf = new Configuration(); + Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false)); + Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false)); + DistCpOptions options = OptionsParser.parse(new String[] { + "-atomic", + "-i", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + options.appendToConf(conf); + Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false)); + Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false)); + Assert.assertEquals(conf.getInt(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1), + DistCpConstants.DEFAULT_BANDWIDTH_MB); + + conf = new Configuration(); + Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false)); + Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.DELETE_MISSING.getConfigLabel(), false)); + Assert.assertEquals(conf.get(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()), null); + options = OptionsParser.parse(new String[] { + "-update", + "-delete", + "-pu", + "-bandwidth", + "11", + "hdfs://localhost:8020/source/first", + "hdfs://localhost:8020/target/"}); + options.appendToConf(conf); + Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false)); + Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.DELETE_MISSING.getConfigLabel(), false)); + Assert.assertEquals(conf.get(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()), "U"); + Assert.assertEquals(conf.getInt(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1), 11); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyCommitter.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyCommitter.java new file mode 100644 index 00000000000..82736f4d3cb --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyCommitter.java @@ -0,0 +1,419 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; +import org.apache.hadoop.mapreduce.task.JobContextImpl; +import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; +import org.apache.hadoop.tools.CopyListing; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.DistCpOptions; +import org.apache.hadoop.tools.DistCpOptions.FileAttribute; +import org.apache.hadoop.tools.GlobbedCopyListing; +import org.apache.hadoop.tools.util.TestDistCpUtils; +import org.apache.hadoop.security.Credentials; +import org.junit.*; + +import java.io.IOException; +import java.util.*; + +public class TestCopyCommitter { + private static final Log LOG = LogFactory.getLog(TestCopyCommitter.class); + + private static final Random rand = new Random(); + + private static final Credentials CREDENTIALS = new Credentials(); + public static final int PORT = 39737; + + + private static Configuration config; + private static MiniDFSCluster cluster; + + private static Job getJobForClient() throws IOException { + Job job = Job.getInstance(new Configuration()); + job.getConfiguration().set("mapred.job.tracker", "localhost:" + PORT); + job.setInputFormatClass(NullInputFormat.class); + job.setOutputFormatClass(NullOutputFormat.class); + job.setNumReduceTasks(0); + return job; + } + + @BeforeClass + public static void create() throws IOException { + config = getJobForClient().getConfiguration(); + config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, 0); + cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true) + .build(); + } + + @AfterClass + public static void destroy() { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Before + public void createMetaFolder() { + config.set(DistCpConstants.CONF_LABEL_META_FOLDER, "/meta"); + Path meta = new Path("/meta"); + try { + cluster.getFileSystem().mkdirs(meta); + } catch (IOException e) { + LOG.error("Exception encountered while creating meta folder", e); + Assert.fail("Unable to create meta folder"); + } + } + + @After + public void cleanupMetaFolder() { + Path meta = new Path("/meta"); + try { + if (cluster.getFileSystem().exists(meta)) { + cluster.getFileSystem().delete(meta, true); + Assert.fail("Expected meta folder to be deleted"); + } + } catch (IOException e) { + LOG.error("Exception encountered while cleaning up folder", e); + Assert.fail("Unable to clean up meta folder"); + } + } + + @Test + public void testNoCommitAction() { + TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config); + JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), + taskAttemptContext.getTaskAttemptID().getJobID()); + try { + OutputCommitter committer = new CopyCommitter(null, taskAttemptContext); + committer.commitJob(jobContext); + Assert.assertEquals(taskAttemptContext.getStatus(), "Commit Successful"); + + //Test for idempotent commit + committer.commitJob(jobContext); + Assert.assertEquals(taskAttemptContext.getStatus(), "Commit Successful"); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Commit 
failed"); + } + } + + @Test + public void testPreserveStatus() { + TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config); + JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), + taskAttemptContext.getTaskAttemptID().getJobID()); + Configuration conf = jobContext.getConfiguration(); + + + String sourceBase; + String targetBase; + FileSystem fs = null; + try { + OutputCommitter committer = new CopyCommitter(null, taskAttemptContext); + fs = FileSystem.get(conf); + FsPermission sourcePerm = new FsPermission((short) 511); + FsPermission initialPerm = new FsPermission((short) 448); + sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm); + targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm); + + DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), + new Path("/out")); + options.preserve(FileAttribute.PERMISSION); + options.appendToConf(conf); + + CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS); + Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong())); + listing.buildListing(listingFile, options); + + conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase); + + committer.commitJob(jobContext); + if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) { + Assert.fail("Permission don't match"); + } + + //Test for idempotent commit + committer.commitJob(jobContext); + if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) { + Assert.fail("Permission don't match"); + } + + } catch (IOException e) { + LOG.error("Exception encountered while testing for preserve status", e); + Assert.fail("Preserve status failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp1"); + } + + } + + @Test + public void testDeleteMissing() { + TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config); + JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), + taskAttemptContext.getTaskAttemptID().getJobID()); + Configuration conf = jobContext.getConfiguration(); + + String sourceBase; + String targetBase; + FileSystem fs = null; + try { + OutputCommitter committer = new CopyCommitter(null, taskAttemptContext); + fs = FileSystem.get(conf); + sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault()); + targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault()); + String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault()); + fs.rename(new Path(targetBaseAdd), new Path(targetBase)); + + DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), + new Path("/out")); + options.setSyncFolder(true); + options.setDeleteMissing(true); + options.appendToConf(conf); + + CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS); + Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong())); + listing.buildListing(listingFile, options); + + conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase); + conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase); + + committer.commitJob(jobContext); + if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) { + Assert.fail("Source and target folders are not in sync"); + } + if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) { + Assert.fail("Source and target folders are not in sync"); + } + + //Test for idempotent commit + committer.commitJob(jobContext); + if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) { + Assert.fail("Source and 
target folders are not in sync"); + } + if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) { + Assert.fail("Source and target folders are not in sync"); + } + } catch (Throwable e) { + LOG.error("Exception encountered while testing for delete missing", e); + Assert.fail("Delete missing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp1"); + conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false"); + } + } + + @Test + public void testDeleteMissingFlatInterleavedFiles() { + TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config); + JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), + taskAttemptContext.getTaskAttemptID().getJobID()); + Configuration conf = jobContext.getConfiguration(); + + + String sourceBase; + String targetBase; + FileSystem fs = null; + try { + OutputCommitter committer = new CopyCommitter(null, taskAttemptContext); + fs = FileSystem.get(conf); + sourceBase = "/tmp1/" + String.valueOf(rand.nextLong()); + targetBase = "/tmp1/" + String.valueOf(rand.nextLong()); + TestDistCpUtils.createFile(fs, sourceBase + "/1"); + TestDistCpUtils.createFile(fs, sourceBase + "/3"); + TestDistCpUtils.createFile(fs, sourceBase + "/4"); + TestDistCpUtils.createFile(fs, sourceBase + "/5"); + TestDistCpUtils.createFile(fs, sourceBase + "/7"); + TestDistCpUtils.createFile(fs, sourceBase + "/8"); + TestDistCpUtils.createFile(fs, sourceBase + "/9"); + + TestDistCpUtils.createFile(fs, targetBase + "/2"); + TestDistCpUtils.createFile(fs, targetBase + "/4"); + TestDistCpUtils.createFile(fs, targetBase + "/5"); + TestDistCpUtils.createFile(fs, targetBase + "/7"); + TestDistCpUtils.createFile(fs, targetBase + "/9"); + TestDistCpUtils.createFile(fs, targetBase + "/A"); + + DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), + new Path("/out")); + options.setSyncFolder(true); + options.setDeleteMissing(true); + options.appendToConf(conf); + + CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS); + Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong())); + listing.buildListing(listingFile, options); + + conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase); + conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase); + + committer.commitJob(jobContext); + if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) { + Assert.fail("Source and target folders are not in sync"); + } + Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4); + + //Test for idempotent commit + committer.commitJob(jobContext); + if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) { + Assert.fail("Source and target folders are not in sync"); + } + Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4); + } catch (IOException e) { + LOG.error("Exception encountered while testing for delete missing", e); + Assert.fail("Delete missing failure"); + } finally { + TestDistCpUtils.delete(fs, "/tmp1"); + conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false"); + } + + } + + @Test + public void testAtomicCommitMissingFinal() { + TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config); + JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), + taskAttemptContext.getTaskAttemptID().getJobID()); + Configuration conf = jobContext.getConfiguration(); + + String workPath = "/tmp1/" + String.valueOf(rand.nextLong()); + String finalPath = "/tmp1/" + String.valueOf(rand.nextLong()); + 
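// Separate randomized work and final paths under /tmp1 for the atomic-commit check. +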
FileSystem fs = null; + try { + OutputCommitter committer = new CopyCommitter(null, taskAttemptContext); + fs = FileSystem.get(conf); + fs.mkdirs(new Path(workPath)); + + conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath); + conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, finalPath); + conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true); + + Assert.assertTrue(fs.exists(new Path(workPath))); + Assert.assertFalse(fs.exists(new Path(finalPath))); + committer.commitJob(jobContext); + Assert.assertFalse(fs.exists(new Path(workPath))); + Assert.assertTrue(fs.exists(new Path(finalPath))); + + //Test for idempotent commit + committer.commitJob(jobContext); + Assert.assertFalse(fs.exists(new Path(workPath))); + Assert.assertTrue(fs.exists(new Path(finalPath))); + + } catch (IOException e) { + LOG.error("Exception encountered while testing for preserve status", e); + Assert.fail("Atomic commit failure"); + } finally { + TestDistCpUtils.delete(fs, workPath); + TestDistCpUtils.delete(fs, finalPath); + } + } + + @Test + public void testAtomicCommitExistingFinal() { + TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config); + JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), + taskAttemptContext.getTaskAttemptID().getJobID()); + Configuration conf = jobContext.getConfiguration(); + + + String workPath = "/tmp1/" + String.valueOf(rand.nextLong()); + String finalPath = "/tmp1/" + String.valueOf(rand.nextLong()); + FileSystem fs = null; + try { + OutputCommitter committer = new CopyCommitter(null, taskAttemptContext); + fs = FileSystem.get(conf); + fs.mkdirs(new Path(workPath)); + fs.mkdirs(new Path(finalPath)); + + conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath); + conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, finalPath); + conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true); + + Assert.assertTrue(fs.exists(new Path(workPath))); + Assert.assertTrue(fs.exists(new Path(finalPath))); + try { + committer.commitJob(jobContext); + Assert.fail("Should not be able to atomic-commit to pre-existing path."); + } catch(Exception exception) { + Assert.assertTrue(fs.exists(new Path(workPath))); + Assert.assertTrue(fs.exists(new Path(finalPath))); + LOG.info("Atomic-commit Test pass."); + } + + } catch (IOException e) { + LOG.error("Exception encountered while testing for atomic commit.", e); + Assert.fail("Atomic commit failure"); + } finally { + TestDistCpUtils.delete(fs, workPath); + TestDistCpUtils.delete(fs, finalPath); + } + } + + private TaskAttemptContext getTaskAttemptContext(Configuration conf) { + return new TaskAttemptContextImpl(conf, + new TaskAttemptID("200707121733", 1, TaskType.MAP, 1, 1)); + } + + private boolean checkDirectoryPermissions(FileSystem fs, String targetBase, + FsPermission sourcePerm) throws IOException { + Path base = new Path(targetBase); + + Stack stack = new Stack(); + stack.push(base); + while (!stack.isEmpty()) { + Path file = stack.pop(); + if (!fs.exists(file)) continue; + FileStatus[] fStatus = fs.listStatus(file); + if (fStatus == null || fStatus.length == 0) continue; + + for (FileStatus status : fStatus) { + if (status.isDirectory()) { + stack.push(status.getPath()); + Assert.assertEquals(status.getPermission(), sourcePerm); + } + } + } + return true; + } + + private static class NullInputFormat extends InputFormat { + @Override + public List getSplits(JobContext context) + throws IOException, InterruptedException { + return Collections.EMPTY_LIST; + } + + @Override + 
public RecordReader createRecordReader(InputSplit split, + TaskAttemptContext context) + throws IOException, InterruptedException { + return null; + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java new file mode 100644 index 00000000000..e5ab0595c38 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyMapper.java @@ -0,0 +1,826 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.tools.DistCpConstants; +import org.apache.hadoop.tools.DistCpOptionSwitch; +import org.apache.hadoop.tools.DistCpOptions; +import org.apache.hadoop.tools.StubContext; +import org.apache.hadoop.tools.util.DistCpUtils; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.security.PrivilegedAction; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +public class TestCopyMapper { + private static final Log LOG = LogFactory.getLog(TestCopyMapper.class); + private static List pathList = new ArrayList(); + private static int nFiles = 0; + private static final int FILE_SIZE = 1024; + + private static MiniDFSCluster cluster; + + private static final String SOURCE_PATH = "/tmp/source"; + private static final String TARGET_PATH = "/tmp/target"; + + private static Configuration configuration; + + @BeforeClass + public static void setup() throws Exception { + configuration = getConfigurationForCluster(); + cluster = new MiniDFSCluster.Builder(configuration) + .numDataNodes(1) + .format(true) + .build(); + } + + private static Configuration getConfigurationForCluster() throws IOException { + Configuration configuration = new Configuration(); + System.setProperty("test.build.data", "target/tmp/build/TEST_COPY_MAPPER/data"); + configuration.set("hadoop.log.dir", "target/tmp"); + LOG.debug("fs.default.name == " + 
configuration.get("fs.default.name")); + LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address")); + return configuration; + } + + private static Configuration getConfiguration() throws IOException { + Configuration configuration = getConfigurationForCluster(); + final FileSystem fs = cluster.getFileSystem(); + Path workPath = new Path(TARGET_PATH) + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + configuration.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, + workPath.toString()); + configuration.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, + workPath.toString()); + configuration.setBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), + false); + configuration.setBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), + true); + configuration.setBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), + true); + configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(), + "br"); + return configuration; + } + + private static void createSourceData() throws Exception { + mkdirs(SOURCE_PATH + "/1"); + mkdirs(SOURCE_PATH + "/2"); + mkdirs(SOURCE_PATH + "/2/3/4"); + mkdirs(SOURCE_PATH + "/2/3"); + mkdirs(SOURCE_PATH + "/5"); + touchFile(SOURCE_PATH + "/5/6"); + mkdirs(SOURCE_PATH + "/7"); + mkdirs(SOURCE_PATH + "/7/8"); + touchFile(SOURCE_PATH + "/7/8/9"); + } + + private static void mkdirs(String path) throws Exception { + FileSystem fileSystem = cluster.getFileSystem(); + final Path qualifiedPath = new Path(path).makeQualified(fileSystem.getUri(), + fileSystem.getWorkingDirectory()); + pathList.add(qualifiedPath); + fileSystem.mkdirs(qualifiedPath); + } + + private static void touchFile(String path) throws Exception { + FileSystem fs; + DataOutputStream outputStream = null; + try { + fs = cluster.getFileSystem(); + final Path qualifiedPath = new Path(path).makeQualified(fs.getUri(), + fs.getWorkingDirectory()); + final long blockSize = fs.getDefaultBlockSize() * 2; + outputStream = fs.create(qualifiedPath, true, 0, + (short)(fs.getDefaultReplication()*2), + blockSize); + outputStream.write(new byte[FILE_SIZE]); + pathList.add(qualifiedPath); + ++nFiles; + + FileStatus fileStatus = fs.getFileStatus(qualifiedPath); + System.out.println(fileStatus.getBlockSize()); + System.out.println(fileStatus.getReplication()); + } + finally { + IOUtils.cleanup(null, outputStream); + } + } + + @Test + public void testRun() { + try { + deleteState(); + createSourceData(); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + copyMapper.setup(context); + + for (Path path: pathList) { + copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)), + fs.getFileStatus(path), context); + } + + // Check that the maps worked. 
+ for (Path path : pathList) { + final Path targetPath = new Path(path.toString() + .replaceAll(SOURCE_PATH, TARGET_PATH)); + Assert.assertTrue(fs.exists(targetPath)); + Assert.assertTrue(fs.isFile(targetPath) == fs.isFile(path)); + Assert.assertEquals(fs.getFileStatus(path).getReplication(), + fs.getFileStatus(targetPath).getReplication()); + Assert.assertEquals(fs.getFileStatus(path).getBlockSize(), + fs.getFileStatus(targetPath).getBlockSize()); + Assert.assertTrue(!fs.isFile(targetPath) || + fs.getFileChecksum(targetPath).equals( + fs.getFileChecksum(path))); + } + + Assert.assertEquals(pathList.size(), + stubContext.getReporter().getCounter(CopyMapper.Counter.COPY).getValue()); + Assert.assertEquals(nFiles * FILE_SIZE, + stubContext.getReporter().getCounter(CopyMapper.Counter.BYTESCOPIED).getValue()); + + testCopyingExistingFiles(fs, copyMapper, context); + for (Text value : stubContext.getWriter().values()) { + Assert.assertTrue(value.toString() + " is not skipped", value.toString().startsWith("SKIP:")); + } + } + catch (Exception e) { + LOG.error("Unexpected exception: ", e); + Assert.assertTrue(false); + } + } + + private void testCopyingExistingFiles(FileSystem fs, CopyMapper copyMapper, + Mapper.Context context) { + + try { + for (Path path : pathList) { + copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)), + fs.getFileStatus(path), context); + } + + Assert.assertEquals(nFiles, + context.getCounter(CopyMapper.Counter.SKIP).getValue()); + } + catch (Exception exception) { + Assert.assertTrue("Caught unexpected exception:" + exception.getMessage(), + false); + } + } + + @Test + public void testMakeDirFailure() { + try { + deleteState(); + createSourceData(); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + + Configuration configuration = context.getConfiguration(); + String workPath = new Path("hftp://localhost:1234/*/*/*/?/") + .makeQualified(fs.getUri(), fs.getWorkingDirectory()).toString(); + configuration.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, + workPath); + copyMapper.setup(context); + + copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), pathList.get(0))), + fs.getFileStatus(pathList.get(0)), context); + + Assert.assertTrue("There should have been an exception.", false); + } + catch (Exception ignore) { + } + } + + @Test + public void testIgnoreFailures() { + doTestIgnoreFailures(true); + doTestIgnoreFailures(false); + } + + @Test + public void testDirToFile() { + try { + deleteState(); + createSourceData(); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + + mkdirs(SOURCE_PATH + "/src/file"); + touchFile(TARGET_PATH + "/src/file"); + try { + copyMapper.setup(context); + copyMapper.map(new Text("/src/file"), + fs.getFileStatus(new Path(SOURCE_PATH + "/src/file")), + context); + } catch (IOException e) { + Assert.assertTrue(e.getMessage().startsWith("Can't replace")); + } + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + } + } + + @Test + public void testPreserve() { + try { + deleteState(); + createSourceData(); + + UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest"); + + final 
CopyMapper copyMapper = new CopyMapper(); + + final Mapper.Context context = tmpUser. + doAs(new PrivilegedAction.Context>() { + @Override + public Mapper.Context run() { + try { + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + return stubContext.getContext(); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + throw new RuntimeException(e); + } + } + }); + + EnumSet preserveStatus = + EnumSet.allOf(DistCpOptions.FileAttribute.class); + + context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS, + DistCpUtils.packAttributes(preserveStatus)); + + touchFile(SOURCE_PATH + "/src/file"); + mkdirs(TARGET_PATH); + cluster.getFileSystem().setPermission(new Path(TARGET_PATH), new FsPermission((short)511)); + + final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction() { + @Override + public FileSystem run() { + try { + return FileSystem.get(configuration); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + throw new RuntimeException("Test ought to fail here"); + } + } + }); + + tmpUser.doAs(new PrivilegedAction() { + @Override + public Integer run() { + try { + copyMapper.setup(context); + copyMapper.map(new Text("/src/file"), + tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")), + context); + Assert.fail("Expected copy to fail"); + } catch (AccessControlException e) { + Assert.assertTrue("Got exception: " + e.getMessage(), true); + } catch (Exception e) { + throw new RuntimeException(e); + } + return null; + } + }); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + } + } + + @Test + public void testCopyReadableFiles() { + try { + deleteState(); + createSourceData(); + + UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest"); + + final CopyMapper copyMapper = new CopyMapper(); + + final Mapper.Context context = tmpUser. 
+ doAs(new PrivilegedAction.Context>() { + @Override + public Mapper.Context run() { + try { + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + return stubContext.getContext(); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + throw new RuntimeException(e); + } + } + }); + + touchFile(SOURCE_PATH + "/src/file"); + mkdirs(TARGET_PATH); + cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"), + new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ)); + cluster.getFileSystem().setPermission(new Path(TARGET_PATH), new FsPermission((short)511)); + + final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction() { + @Override + public FileSystem run() { + try { + return FileSystem.get(configuration); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + throw new RuntimeException("Test ought to fail here"); + } + } + }); + + tmpUser.doAs(new PrivilegedAction() { + @Override + public Integer run() { + try { + copyMapper.setup(context); + copyMapper.map(new Text("/src/file"), + tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")), + context); + } catch (Exception e) { + throw new RuntimeException(e); + } + return null; + } + }); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + } + } + + @Test + public void testSkipCopyNoPerms() { + try { + deleteState(); + createSourceData(); + + UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest"); + + final CopyMapper copyMapper = new CopyMapper(); + + final StubContext stubContext = tmpUser. + doAs(new PrivilegedAction() { + @Override + public StubContext run() { + try { + return new StubContext(getConfiguration(), null, 0); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + throw new RuntimeException(e); + } + } + }); + + final Mapper.Context context = stubContext.getContext(); + EnumSet preserveStatus = + EnumSet.allOf(DistCpOptions.FileAttribute.class); + + context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS, + DistCpUtils.packAttributes(preserveStatus)); + + touchFile(SOURCE_PATH + "/src/file"); + touchFile(TARGET_PATH + "/src/file"); + cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"), + new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ)); + cluster.getFileSystem().setPermission(new Path(TARGET_PATH + "/src/file"), + new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ)); + + final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction() { + @Override + public FileSystem run() { + try { + return FileSystem.get(configuration); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + throw new RuntimeException("Test ought to fail here"); + } + } + }); + + tmpUser.doAs(new PrivilegedAction() { + @Override + public Integer run() { + try { + copyMapper.setup(context); + copyMapper.map(new Text("/src/file"), + tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")), + context); + Assert.assertEquals(stubContext.getWriter().values().size(), 1); + Assert.assertTrue(stubContext.getWriter().values().get(0).toString().startsWith("SKIP")); + Assert.assertTrue(stubContext.getWriter().values().get(0).toString(). 
+ contains(SOURCE_PATH + "/src/file")); + } catch (Exception e) { + throw new RuntimeException(e); + } + return null; + } + }); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + } + } + + @Test + public void testFailCopyWithAccessControlException() { + try { + deleteState(); + createSourceData(); + + UserGroupInformation tmpUser = UserGroupInformation.createRemoteUser("guest"); + + final CopyMapper copyMapper = new CopyMapper(); + + final StubContext stubContext = tmpUser. + doAs(new PrivilegedAction() { + @Override + public StubContext run() { + try { + return new StubContext(getConfiguration(), null, 0); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + throw new RuntimeException(e); + } + } + }); + + EnumSet preserveStatus = + EnumSet.allOf(DistCpOptions.FileAttribute.class); + + final Mapper.Context context + = stubContext.getContext(); + + context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS, + DistCpUtils.packAttributes(preserveStatus)); + + touchFile(SOURCE_PATH + "/src/file"); + OutputStream out = cluster.getFileSystem().create(new Path(TARGET_PATH + "/src/file")); + out.write("hello world".getBytes()); + out.close(); + cluster.getFileSystem().setPermission(new Path(SOURCE_PATH + "/src/file"), + new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ)); + cluster.getFileSystem().setPermission(new Path(TARGET_PATH + "/src/file"), + new FsPermission(FsAction.READ, FsAction.READ, FsAction.READ)); + + final FileSystem tmpFS = tmpUser.doAs(new PrivilegedAction() { + @Override + public FileSystem run() { + try { + return FileSystem.get(configuration); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + throw new RuntimeException("Test ought to fail here"); + } + } + }); + + tmpUser.doAs(new PrivilegedAction() { + @Override + public Integer run() { + try { + copyMapper.setup(context); + copyMapper.map(new Text("/src/file"), + tmpFS.getFileStatus(new Path(SOURCE_PATH + "/src/file")), + context); + Assert.fail("Didn't expect the file to be copied"); + } catch (AccessControlException ignore) { + } catch (Exception e) { + if (e.getCause() == null || !(e.getCause() instanceof AccessControlException)) { + throw new RuntimeException(e); + } + } + return null; + } + }); + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + } + } + + @Test + public void testFileToDir() { + try { + deleteState(); + createSourceData(); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + + touchFile(SOURCE_PATH + "/src/file"); + mkdirs(TARGET_PATH + "/src/file"); + try { + copyMapper.setup(context); + copyMapper.map(new Text("/src/file"), + fs.getFileStatus(new Path(SOURCE_PATH + "/src/file")), + context); + } catch (IOException e) { + Assert.assertTrue(e.getMessage().startsWith("Can't replace")); + } + } catch (Exception e) { + LOG.error("Exception encountered ", e); + Assert.fail("Test failed: " + e.getMessage()); + } + } + + private void doTestIgnoreFailures(boolean ignoreFailures) { + try { + deleteState(); + createSourceData(); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + 
Mapper.Context context + = stubContext.getContext(); + + Configuration configuration = context.getConfiguration(); + configuration.setBoolean( + DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(),ignoreFailures); + configuration.setBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), + true); + configuration.setBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), + true); + copyMapper.setup(context); + + for (Path path : pathList) { + final FileStatus fileStatus = fs.getFileStatus(path); + if (!fileStatus.isDirectory()) { + fs.delete(path, true); + copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)), + fileStatus, context); + } + } + if (ignoreFailures) { + for (Text value : stubContext.getWriter().values()) { + Assert.assertTrue(value.toString() + " is not skipped", value.toString().startsWith("FAIL:")); + } + } + Assert.assertTrue("There should have been an exception.", ignoreFailures); + } + catch (Exception e) { + Assert.assertTrue("Unexpected exception: " + e.getMessage(), + !ignoreFailures); + e.printStackTrace(); + } + } + + private static void deleteState() throws IOException { + pathList.clear(); + nFiles = 0; + cluster.getFileSystem().delete(new Path(SOURCE_PATH), true); + cluster.getFileSystem().delete(new Path(TARGET_PATH), true); + } + + @Test + public void testPreserveBlockSizeAndReplication() { + testPreserveBlockSizeAndReplicationImpl(true); + testPreserveBlockSizeAndReplicationImpl(false); + } + + private void testPreserveBlockSizeAndReplicationImpl(boolean preserve){ + try { + + deleteState(); + createSourceData(); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + + Configuration configuration = context.getConfiguration(); + EnumSet fileAttributes + = EnumSet.noneOf(DistCpOptions.FileAttribute.class); + if (preserve) { + fileAttributes.add(DistCpOptions.FileAttribute.BLOCKSIZE); + fileAttributes.add(DistCpOptions.FileAttribute.REPLICATION); + } + configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(), + DistCpUtils.packAttributes(fileAttributes)); + + copyMapper.setup(context); + + for (Path path : pathList) { + final FileStatus fileStatus = fs.getFileStatus(path); + copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)), + fileStatus, context); + } + + // Check that the block-size/replication aren't preserved. 
+ for (Path path : pathList) { + final Path targetPath = new Path(path.toString() + .replaceAll(SOURCE_PATH, TARGET_PATH)); + final FileStatus source = fs.getFileStatus(path); + final FileStatus target = fs.getFileStatus(targetPath); + if (!source.isDirectory() ) { + Assert.assertTrue(preserve || + source.getBlockSize() != target.getBlockSize()); + Assert.assertTrue(preserve || + source.getReplication() != target.getReplication()); + Assert.assertTrue(!preserve || + source.getBlockSize() == target.getBlockSize()); + Assert.assertTrue(!preserve || + source.getReplication() == target.getReplication()); + } + } + } + catch (Exception e) { + Assert.assertTrue("Unexpected exception: " + e.getMessage(), false); + e.printStackTrace(); + } + } + + private static void changeUserGroup(String user, String group) + throws IOException { + FileSystem fs = cluster.getFileSystem(); + FsPermission changedPermission = new FsPermission( + FsAction.ALL, FsAction.ALL, FsAction.ALL + ); + for (Path path : pathList) + if (fs.isFile(path)) { + fs.setOwner(path, user, group); + fs.setPermission(path, changedPermission); + } + } + + /** + * If a single file is being copied to a location where the file (of the same + * name) already exists, then the file shouldn't be skipped. + */ + @Test + public void testSingleFileCopy() { + try { + deleteState(); + touchFile(SOURCE_PATH + "/1"); + Path sourceFilePath = pathList.get(0); + Path targetFilePath = new Path(sourceFilePath.toString().replaceAll( + SOURCE_PATH, TARGET_PATH)); + touchFile(targetFilePath.toString()); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + + context.getConfiguration().set( + DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, + targetFilePath.getParent().toString()); // Parent directory. + copyMapper.setup(context); + + final FileStatus sourceFileStatus = fs.getFileStatus(sourceFilePath); + + long before = fs.getFileStatus(targetFilePath).getModificationTime(); + copyMapper.map(new Text(DistCpUtils.getRelativePath( + new Path(SOURCE_PATH), sourceFilePath)), sourceFileStatus, context); + long after = fs.getFileStatus(targetFilePath).getModificationTime(); + + Assert.assertTrue("File should have been skipped", before == after); + + context.getConfiguration().set( + DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, + targetFilePath.toString()); // Specify the file path. 
+ copyMapper.setup(context); + + before = fs.getFileStatus(targetFilePath).getModificationTime(); + try { Thread.sleep(2); } catch (Throwable ignore) {} + copyMapper.map(new Text(DistCpUtils.getRelativePath( + new Path(SOURCE_PATH), sourceFilePath)), sourceFileStatus, context); + after = fs.getFileStatus(targetFilePath).getModificationTime(); + + Assert.assertTrue("File should have been overwritten.", before < after); + + } catch (Exception exception) { + Assert.fail("Unexpected exception: " + exception.getMessage()); + exception.printStackTrace(); + } + } + + @Test + public void testPreserveUserGroup() { + testPreserveUserGroupImpl(true); + testPreserveUserGroupImpl(false); + } + + private void testPreserveUserGroupImpl(boolean preserve){ + try { + + deleteState(); + createSourceData(); + changeUserGroup("Michael", "Corleone"); + + FileSystem fs = cluster.getFileSystem(); + CopyMapper copyMapper = new CopyMapper(); + StubContext stubContext = new StubContext(getConfiguration(), null, 0); + Mapper.Context context + = stubContext.getContext(); + + Configuration configuration = context.getConfiguration(); + EnumSet fileAttributes + = EnumSet.noneOf(DistCpOptions.FileAttribute.class); + if (preserve) { + fileAttributes.add(DistCpOptions.FileAttribute.USER); + fileAttributes.add(DistCpOptions.FileAttribute.GROUP); + fileAttributes.add(DistCpOptions.FileAttribute.PERMISSION); + } + + configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(), + DistCpUtils.packAttributes(fileAttributes)); + copyMapper.setup(context); + + for (Path path : pathList) { + final FileStatus fileStatus = fs.getFileStatus(path); + copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)), + fileStatus, context); + } + + // Check that the user/group attributes are preserved + // (only) as necessary. + for (Path path : pathList) { + final Path targetPath = new Path(path.toString() + .replaceAll(SOURCE_PATH, TARGET_PATH)); + final FileStatus source = fs.getFileStatus(path); + final FileStatus target = fs.getFileStatus(targetPath); + if (!source.isDirectory()) { + Assert.assertTrue(!preserve || source.getOwner().equals(target.getOwner())); + Assert.assertTrue(!preserve || source.getGroup().equals(target.getGroup())); + Assert.assertTrue(!preserve || source.getPermission().equals(target.getPermission())); + Assert.assertTrue( preserve || !source.getOwner().equals(target.getOwner())); + Assert.assertTrue( preserve || !source.getGroup().equals(target.getGroup())); + Assert.assertTrue( preserve || !source.getPermission().equals(target.getPermission())); + Assert.assertTrue(source.isDirectory() || + source.getReplication() != target.getReplication()); + } + } + } + catch (Exception e) { + Assert.assertTrue("Unexpected exception: " + e.getMessage(), false); + e.printStackTrace(); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyOutputFormat.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyOutputFormat.java new file mode 100644 index 00000000000..69cec31fcc9 --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestCopyOutputFormat.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; +import org.apache.hadoop.mapreduce.task.JobContextImpl; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.tools.DistCpConstants; +import org.junit.Test; +import org.junit.Assert; + +import java.io.IOException; + +public class TestCopyOutputFormat { + private static final Log LOG = LogFactory.getLog(TestCopyOutputFormat.class); + + @Test + public void testSetCommitDirectory() { + try { + Job job = Job.getInstance(new Configuration()); + Assert.assertEquals(null, CopyOutputFormat.getCommitDirectory(job)); + + job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, ""); + Assert.assertEquals(null, CopyOutputFormat.getCommitDirectory(job)); + + Path directory = new Path("/tmp/test"); + CopyOutputFormat.setCommitDirectory(job, directory); + Assert.assertEquals(directory, CopyOutputFormat.getCommitDirectory(job)); + Assert.assertEquals(directory.toString(), job.getConfiguration(). + get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH)); + } catch (IOException e) { + LOG.error("Exception encountered while running test", e); + Assert.fail("Failed while testing for set Commit Directory"); + } + } + + @Test + public void testSetWorkingDirectory() { + try { + Job job = Job.getInstance(new Configuration()); + Assert.assertEquals(null, CopyOutputFormat.getWorkingDirectory(job)); + + job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, ""); + Assert.assertEquals(null, CopyOutputFormat.getWorkingDirectory(job)); + + Path directory = new Path("/tmp/test"); + CopyOutputFormat.setWorkingDirectory(job, directory); + Assert.assertEquals(directory, CopyOutputFormat.getWorkingDirectory(job)); + Assert.assertEquals(directory.toString(), job.getConfiguration(). 
+ get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH)); + } catch (IOException e) { + LOG.error("Exception encountered while running test", e); + Assert.fail("Failed while testing for set Working Directory"); + } + } + + @Test + public void testGetOutputCommitter() { + try { + TaskAttemptContext context = new TaskAttemptContextImpl(new Configuration(), + new TaskAttemptID("200707121733", 1, TaskType.MAP, 1, 1)); + context.getConfiguration().set("mapred.output.dir", "/out"); + Assert.assertTrue(new CopyOutputFormat().getOutputCommitter(context) instanceof CopyCommitter); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Unable to get output committer"); + } + } + + @Test + public void testCheckOutputSpecs() { + try { + OutputFormat outputFormat = new CopyOutputFormat(); + Job job = Job.getInstance(new Configuration()); + JobID jobID = new JobID("200707121733", 1); + + try { + JobContext context = new JobContextImpl(job.getConfiguration(), jobID); + outputFormat.checkOutputSpecs(context); + Assert.fail("No checking for invalid work/commit path"); + } catch (IllegalStateException ignore) { } + + CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work")); + try { + JobContext context = new JobContextImpl(job.getConfiguration(), jobID); + outputFormat.checkOutputSpecs(context); + Assert.fail("No checking for invalid commit path"); + } catch (IllegalStateException ignore) { } + + job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, ""); + CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit")); + try { + JobContext context = new JobContextImpl(job.getConfiguration(), jobID); + outputFormat.checkOutputSpecs(context); + Assert.fail("No checking for invalid work path"); + } catch (IllegalStateException ignore) { } + + CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work")); + CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit")); + try { + JobContext context = new JobContextImpl(job.getConfiguration(), jobID); + outputFormat.checkOutputSpecs(context); + } catch (IllegalStateException ignore) { + Assert.fail("Output spec check failed."); + } + + } catch (IOException e) { + LOG.error("Exception encountered while testing checkoutput specs", e); + Assert.fail("Checkoutput Spec failure"); + } catch (InterruptedException e) { + LOG.error("Exception encountered while testing checkoutput specs", e); + Assert.fail("Checkoutput Spec failure"); + } + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestUniformSizeInputFormat.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestUniformSizeInputFormat.java new file mode 100644 index 00000000000..b9d421a1bac --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/mapred/TestUniformSizeInputFormat.java @@ -0,0 +1,254 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.mapred; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.task.JobContextImpl; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.tools.CopyListing; +import org.apache.hadoop.tools.DistCpOptions; +import org.apache.hadoop.tools.StubContext; +import org.apache.hadoop.security.Credentials; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + + +public class TestUniformSizeInputFormat { + private static final Log LOG + = LogFactory.getLog(TestUniformSizeInputFormat.class); + + private static MiniDFSCluster cluster; + private static final int N_FILES = 20; + private static final int SIZEOF_EACH_FILE=1024; + private static final Random random = new Random(); + private static int totalFileSize = 0; + + private static final Credentials CREDENTIALS = new Credentials(); + + + @BeforeClass + public static void setup() throws Exception { + cluster = new MiniDFSCluster.Builder(new Configuration()).numDataNodes(1) + .format(true).build(); + totalFileSize = 0; + + for (int i=0; i sourceList = new ArrayList(); + sourceList.add(sourcePath); + final DistCpOptions distCpOptions = new DistCpOptions(sourceList, targetPath); + distCpOptions.setMaxMaps(nMaps); + return distCpOptions; + } + + private static int createFile(String path, int fileSize) throws Exception { + FileSystem fileSystem = null; + DataOutputStream outputStream = null; + try { + fileSystem = cluster.getFileSystem(); + outputStream = fileSystem.create(new Path(path), true, 0); + int size = (int) Math.ceil(fileSize + (1 - random.nextFloat()) * fileSize); + outputStream.write(new byte[size]); + return size; + } + finally { + IOUtils.cleanup(null, fileSystem, outputStream); + } + } + + @AfterClass + public static void tearDown() { + cluster.shutdown(); + } + + public void testGetSplits(int nMaps) throws Exception { + DistCpOptions options = getOptions(nMaps); + Configuration configuration = new Configuration(); + configuration.set("mapred.map.tasks", + String.valueOf(options.getMaxMaps())); + Path listFile = new Path(cluster.getFileSystem().getUri().toString() + + "/tmp/testGetSplits_1/fileList.seq"); + CopyListing.getCopyListing(configuration, CREDENTIALS, options). 
+ buildListing(listFile, options); + + JobContext jobContext = new JobContextImpl(configuration, new JobID()); + UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat(); + List splits + = uniformSizeInputFormat.getSplits(jobContext); + + List legacySplits = legacyGetSplits(listFile, nMaps); + + int sizePerMap = totalFileSize/nMaps; + + checkSplits(listFile, splits); + checkAgainstLegacy(splits, legacySplits); + + int doubleCheckedTotalSize = 0; + int previousSplitSize = -1; + for (int i=0; i recordReader = uniformSizeInputFormat.createRecordReader( + split, null); + StubContext stubContext = new StubContext(jobContext.getConfiguration(), + recordReader, 0); + final TaskAttemptContext taskAttemptContext + = stubContext.getContext(); + recordReader.initialize(split, taskAttemptContext); + while (recordReader.nextKeyValue()) { + Path sourcePath = recordReader.getCurrentValue().getPath(); + FileSystem fs = sourcePath.getFileSystem(configuration); + FileStatus fileStatus [] = fs.listStatus(sourcePath); + Assert.assertEquals(fileStatus.length, 1); + currentSplitSize += fileStatus[0].getLen(); + } + Assert.assertTrue( + previousSplitSize == -1 + || Math.abs(currentSplitSize - previousSplitSize) < 0.1*sizePerMap + || i == splits.size()-1); + + doubleCheckedTotalSize += currentSplitSize; + } + + Assert.assertEquals(totalFileSize, doubleCheckedTotalSize); + } + + // From + // http://svn.apache.org/repos/asf/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/DistCp.java + private List legacyGetSplits(Path listFile, int numSplits) + throws IOException { + + FileSystem fs = cluster.getFileSystem(); + FileStatus srcst = fs.getFileStatus(listFile); + Configuration conf = fs.getConf(); + + ArrayList splits = new ArrayList(numSplits); + FileStatus value = new FileStatus(); + Text key = new Text(); + final long targetsize = totalFileSize / numSplits; + long pos = 0L; + long last = 0L; + long acc = 0L; + long cbrem = srcst.getLen(); + SequenceFile.Reader sl = null; + + LOG.info("Average bytes per map: " + targetsize + + ", Number of maps: " + numSplits + ", total size: " + totalFileSize); + + try { + sl = new SequenceFile.Reader(conf, SequenceFile.Reader.file(listFile)); + for (; sl.next(key, value); last = sl.getPosition()) { + // if adding this split would put this split past the target size, + // cut the last split and put this next file in the next split. 
+ if (acc + value.getLen() > targetsize && acc != 0) { + long splitsize = last - pos; + FileSplit fileSplit = new FileSplit(listFile, pos, splitsize, null); + LOG.info ("Creating split : " + fileSplit + ", bytes in split: " + splitsize); + splits.add(fileSplit); + cbrem -= splitsize; + pos = last; + acc = 0L; + } + acc += value.getLen(); + } + } + finally { + IOUtils.closeStream(sl); + } + if (cbrem != 0) { + FileSplit fileSplit = new FileSplit(listFile, pos, cbrem, null); + LOG.info ("Creating split : " + fileSplit + ", bytes in split: " + cbrem); + splits.add(fileSplit); + } + + return splits; + } + + private void checkSplits(Path listFile, List splits) throws IOException { + long lastEnd = 0; + + //Verify if each split's start is matching with the previous end and + //we are not missing anything + for (InputSplit split : splits) { + FileSplit fileSplit = (FileSplit) split; + long start = fileSplit.getStart(); + Assert.assertEquals(lastEnd, start); + lastEnd = start + fileSplit.getLength(); + } + + //Verify there is nothing more to read from the input file + SequenceFile.Reader reader + = new SequenceFile.Reader(cluster.getFileSystem().getConf(), + SequenceFile.Reader.file(listFile)); + + try { + reader.seek(lastEnd); + FileStatus srcFileStatus = new FileStatus(); + Text srcRelPath = new Text(); + Assert.assertFalse(reader.next(srcRelPath, srcFileStatus)); + } finally { + IOUtils.closeStream(reader); + } + } + + private void checkAgainstLegacy(List splits, + List legacySplits) + throws IOException, InterruptedException { + + Assert.assertEquals(legacySplits.size(), splits.size()); + for (int index = 0; index < splits.size(); index++) { + FileSplit fileSplit = (FileSplit) splits.get(index); + FileSplit legacyFileSplit = (FileSplit) legacySplits.get(index); + Assert.assertEquals(fileSplit.getStart(), legacyFileSplit.getStart()); + } + } + + @Test + public void testGetSplits() throws Exception { + testGetSplits(9); + for (int i=1; i expectedFilePaths = new ArrayList(N_FILES); + + @BeforeClass + public static void setup() throws Exception { + cluster = new MiniDFSCluster.Builder(getConfigurationForCluster()) + .numDataNodes(1).format(true).build(); + + for (int i=0; i sourceList = new ArrayList(); + sourceList.add(sourcePath); + DistCpOptions options = new DistCpOptions(sourceList, targetPath); + options.setMaxMaps(NUM_SPLITS); + return options; + } + + private static void createFile(String path) throws Exception { + FileSystem fileSystem = null; + DataOutputStream outputStream = null; + try { + fileSystem = cluster.getFileSystem(); + outputStream = fileSystem.create(new Path(path), true, 0); + expectedFilePaths.add(fileSystem.listStatus( + new Path(path))[0].getPath().toString()); + } + finally { + IOUtils.cleanup(null, fileSystem, outputStream); + } + } + + @AfterClass + public static void tearDown() { + cluster.shutdown(); + } + + @Test + public void testGetSplits() throws Exception { + DistCpOptions options = getOptions(); + Configuration configuration = new Configuration(); + configuration.set("mapred.map.tasks", + String.valueOf(options.getMaxMaps())); + CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing( + new Path(cluster.getFileSystem().getUri().toString() + +"/tmp/testDynInputFormat/fileList.seq"), options); + + JobContext jobContext = new JobContextImpl(configuration, new JobID()); + DynamicInputFormat inputFormat = + new DynamicInputFormat(); + List splits = inputFormat.getSplits(jobContext); + + int nFiles = 0; + int taskId = 0; + + for (InputSplit 
split : splits) { + RecordReader recordReader = + inputFormat.createRecordReader(split, null); + StubContext stubContext = new StubContext(jobContext.getConfiguration(), + recordReader, taskId); + final TaskAttemptContext taskAttemptContext + = stubContext.getContext(); + + recordReader.initialize(splits.get(0), taskAttemptContext); + float previousProgressValue = 0f; + while (recordReader.nextKeyValue()) { + FileStatus fileStatus = recordReader.getCurrentValue(); + String source = fileStatus.getPath().toString(); + System.out.println(source); + Assert.assertTrue(expectedFilePaths.contains(source)); + final float progress = recordReader.getProgress(); + Assert.assertTrue(progress >= previousProgressValue); + Assert.assertTrue(progress >= 0.0f); + Assert.assertTrue(progress <= 1.0f); + previousProgressValue = progress; + ++nFiles; + } + Assert.assertTrue(recordReader.getProgress() == 1.0f); + + ++taskId; + } + + Assert.assertEquals(expectedFilePaths.size(), nFiles); + } + + @Test + public void testGetSplitRatio() throws Exception { + Assert.assertEquals(1, DynamicInputFormat.getSplitRatio(1, 1000000000)); + Assert.assertEquals(2, DynamicInputFormat.getSplitRatio(11000000, 10)); + Assert.assertEquals(4, DynamicInputFormat.getSplitRatio(30, 700)); + Assert.assertEquals(2, DynamicInputFormat.getSplitRatio(30, 200)); + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java new file mode 100644 index 00000000000..762979287dd --- /dev/null +++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestDistCpUtils.java @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.tools.util; + +import org.apache.hadoop.tools.DistCpOptions.FileAttribute; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.hadoop.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.Assert; +import org.junit.Test; +import org.junit.BeforeClass; +import org.junit.AfterClass; + +import java.util.EnumSet; +import java.util.Random; +import java.util.Stack; +import java.io.IOException; +import java.io.OutputStream; + +public class TestDistCpUtils { + private static final Log LOG = LogFactory.getLog(TestDistCpUtils.class); + + private static final Configuration config = new Configuration(); + private static MiniDFSCluster cluster; + + @BeforeClass + public static void create() throws IOException { + cluster = new MiniDFSCluster.Builder(config).numDataNodes(1).format(true) + .build(); + } + + @AfterClass + public static void destroy() { + if (cluster != null) { + cluster.shutdown(); + } + } + + @Test + public void testGetRelativePathRoot() { + Path root = new Path("/tmp/abc"); + Path child = new Path("/tmp/abc/xyz/file"); + Assert.assertEquals(DistCpUtils.getRelativePath(root, child), "/xyz/file"); + + root = new Path("/"); + child = new Path("/a"); + Assert.assertEquals(DistCpUtils.getRelativePath(root, child), "/a"); + } + + @Test + public void testPackAttributes() { + EnumSet attributes = EnumSet.noneOf(FileAttribute.class); + Assert.assertEquals(DistCpUtils.packAttributes(attributes), ""); + + attributes.add(FileAttribute.REPLICATION); + Assert.assertEquals(DistCpUtils.packAttributes(attributes), "R"); + Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("R")); + + attributes.add(FileAttribute.BLOCKSIZE); + Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RB"); + Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RB")); + + attributes.add(FileAttribute.USER); + Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RBU"); + Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RBU")); + + attributes.add(FileAttribute.GROUP); + Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RBUG"); + Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RBUG")); + + attributes.add(FileAttribute.PERMISSION); + Assert.assertEquals(DistCpUtils.packAttributes(attributes), "RBUGP"); + Assert.assertEquals(attributes, DistCpUtils.unpackAttributes("RBUGP")); + } + + @Test + public void testPreserve() { + try { + FileSystem fs = FileSystem.get(config); + EnumSet attributes = EnumSet.noneOf(FileAttribute.class); + + + Path path = new Path("/tmp/abc"); + Path src = new Path("/tmp/src"); + fs.mkdirs(path); + fs.mkdirs(src); + FileStatus srcStatus = fs.getFileStatus(src); + + FsPermission noPerm = new FsPermission((short) 0); + fs.setPermission(path, noPerm); + fs.setOwner(path, "nobody", "nobody"); + + DistCpUtils.preserve(fs, path, srcStatus, attributes); + FileStatus target = fs.getFileStatus(path); + Assert.assertEquals(target.getPermission(), noPerm); + Assert.assertEquals(target.getOwner(), "nobody"); + Assert.assertEquals(target.getGroup(), "nobody"); + + attributes.add(FileAttribute.PERMISSION); + DistCpUtils.preserve(fs, path, srcStatus, attributes); + target = fs.getFileStatus(path); + 
Assert.assertEquals(target.getPermission(), srcStatus.getPermission()); + Assert.assertEquals(target.getOwner(), "nobody"); + Assert.assertEquals(target.getGroup(), "nobody"); + + attributes.add(FileAttribute.GROUP); + attributes.add(FileAttribute.USER); + DistCpUtils.preserve(fs, path, srcStatus, attributes); + target = fs.getFileStatus(path); + Assert.assertEquals(target.getPermission(), srcStatus.getPermission()); + Assert.assertEquals(target.getOwner(), srcStatus.getOwner()); + Assert.assertEquals(target.getGroup(), srcStatus.getGroup()); + + fs.delete(path, true); + fs.delete(src, true); + } catch (IOException e) { + LOG.error("Exception encountered ", e); + Assert.fail("Preserve test failure"); + } + } + + private static Random rand = new Random(); + + public static String createTestSetup(FileSystem fs) throws IOException { + return createTestSetup("/tmp1", fs, FsPermission.getDefault()); + } + + public static String createTestSetup(FileSystem fs, + FsPermission perm) throws IOException { + return createTestSetup("/tmp1", fs, perm); + } + + public static String createTestSetup(String baseDir, + FileSystem fs, + FsPermission perm) throws IOException { + String base = getBase(baseDir); + fs.mkdirs(new Path(base + "/newTest/hello/world1")); + fs.mkdirs(new Path(base + "/newTest/hello/world2/newworld")); + fs.mkdirs(new Path(base + "/newTest/hello/world3/oldworld")); + fs.setPermission(new Path(base + "/newTest"), perm); + fs.setPermission(new Path(base + "/newTest/hello"), perm); + fs.setPermission(new Path(base + "/newTest/hello/world1"), perm); + fs.setPermission(new Path(base + "/newTest/hello/world2"), perm); + fs.setPermission(new Path(base + "/newTest/hello/world2/newworld"), perm); + fs.setPermission(new Path(base + "/newTest/hello/world3"), perm); + fs.setPermission(new Path(base + "/newTest/hello/world3/oldworld"), perm); + createFile(fs, base + "/newTest/1"); + createFile(fs, base + "/newTest/hello/2"); + createFile(fs, base + "/newTest/hello/world3/oldworld/3"); + createFile(fs, base + "/newTest/hello/world2/4"); + return base; + } + + private static String getBase(String base) { + String location = String.valueOf(rand.nextLong()); + return base + "/" + location; + } + + public static void delete(FileSystem fs, String path) { + try { + if (fs != null) { + if (path != null) { + fs.delete(new Path(path), true); + } + } + } catch (IOException e) { + LOG.warn("Exception encountered ", e); + } + } + + public static void createFile(FileSystem fs, String filePath) throws IOException { + OutputStream out = fs.create(new Path(filePath)); + IOUtils.closeStream(out); + } + + public static boolean checkIfFoldersAreInSync(FileSystem fs, String targetBase, String sourceBase) + throws IOException { + Path base = new Path(targetBase); + + Stack stack = new Stack(); + stack.push(base); + while (!stack.isEmpty()) { + Path file = stack.pop(); + if (!fs.exists(file)) continue; + FileStatus[] fStatus = fs.listStatus(file); + if (fStatus == null || fStatus.length == 0) continue; + + for (FileStatus status : fStatus) { + if (status.isDirectory()) { + stack.push(status.getPath()); + } + Assert.assertTrue(fs.exists(new Path(sourceBase + "/" + + DistCpUtils.getRelativePath(new Path(targetBase), status.getPath())))); + } + } + return true; + } +} diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestRetriableCommand.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestRetriableCommand.java new file mode 100644 index 00000000000..a4c2d8d3db5 
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestRetriableCommand.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.util;
+
+import org.apache.hadoop.io.retry.RetryPolicy;
+import org.apache.hadoop.io.retry.RetryPolicies;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.concurrent.TimeUnit;
+
+public class TestRetriableCommand {
+
+  private static class MyRetriableCommand extends RetriableCommand {
+
+    private int succeedAfter;
+    private int retryCount = 0;
+
+    public MyRetriableCommand(int succeedAfter) {
+      super("MyRetriableCommand");
+      this.succeedAfter = succeedAfter;
+    }
+
+    public MyRetriableCommand(int succeedAfter, RetryPolicy retryPolicy) {
+      super("MyRetriableCommand", retryPolicy);
+      this.succeedAfter = succeedAfter;
+    }
+
+    @Override
+    protected Object doExecute(Object... arguments) throws Exception {
+      if (++retryCount < succeedAfter)
+        throw new Exception("Transient failure#" + retryCount);
+      return 0;
+    }
+  }
+
+  @Test
+  public void testRetriableCommand() {
+    try {
+      new MyRetriableCommand(5).execute(0);
+      Assert.fail("Expected execution to fail: the command succeeds only on its fifth attempt.");
+    }
+    catch (Exception e) {
+      // Expected: the default retry policy gives up before the command can succeed.
+    }
+
+    try {
+      new MyRetriableCommand(3).execute(0);
+    }
+    catch (Exception e) {
+      Assert.fail("Command should have succeeded within the default retries: " + e.getMessage());
+    }
+
+    try {
+      new MyRetriableCommand(5, RetryPolicies.
+          retryUpToMaximumCountWithFixedSleep(5, 0, TimeUnit.MILLISECONDS)).execute(0);
+    }
+    catch (Exception e) {
+      Assert.fail("Command should have succeeded, given 5 retries: " + e.getMessage());
+    }
+  }
+}
diff --git a/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestThrottledInputStream.java b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestThrottledInputStream.java
new file mode 100644
index 00000000000..0ac89980c58
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/util/TestThrottledInputStream.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.tools.util; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.IOUtils; +import org.junit.Assert; +import org.junit.Test; + +import java.io.*; + +public class TestThrottledInputStream { + private static final Log LOG = LogFactory.getLog(TestThrottledInputStream.class); + private static final int BUFF_SIZE = 1024; + + private enum CB {ONE_C, BUFFER, BUFF_OFFSET} + + @Test + public void testRead() { + File tmpFile; + File outFile; + try { + tmpFile = createFile(1024); + outFile = createFile(); + + tmpFile.deleteOnExit(); + outFile.deleteOnExit(); + + long maxBandwidth = copyAndAssert(tmpFile, outFile, 0, 1, -1, CB.BUFFER); + + copyAndAssert(tmpFile, outFile, maxBandwidth, 20, 0, CB.BUFFER); +/* + copyAndAssert(tmpFile, outFile, maxBandwidth, 10, 0, CB.BUFFER); + copyAndAssert(tmpFile, outFile, maxBandwidth, 50, 0, CB.BUFFER); +*/ + + copyAndAssert(tmpFile, outFile, maxBandwidth, 20, 0, CB.BUFF_OFFSET); +/* + copyAndAssert(tmpFile, outFile, maxBandwidth, 10, 0, CB.BUFF_OFFSET); + copyAndAssert(tmpFile, outFile, maxBandwidth, 50, 0, CB.BUFF_OFFSET); +*/ + + copyAndAssert(tmpFile, outFile, maxBandwidth, 20, 0, CB.ONE_C); +/* + copyAndAssert(tmpFile, outFile, maxBandwidth, 10, 0, CB.ONE_C); + copyAndAssert(tmpFile, outFile, maxBandwidth, 50, 0, CB.ONE_C); +*/ + } catch (IOException e) { + LOG.error("Exception encountered ", e); + } + } + + private long copyAndAssert(File tmpFile, File outFile, + long maxBandwidth, float factor, + int sleepTime, CB flag) throws IOException { + long bandwidth; + ThrottledInputStream in; + long maxBPS = (long) (maxBandwidth / factor); + + if (maxBandwidth == 0) { + in = new ThrottledInputStream(new FileInputStream(tmpFile)); + } else { + in = new ThrottledInputStream(new FileInputStream(tmpFile), maxBPS); + } + OutputStream out = new FileOutputStream(outFile); + try { + if (flag == CB.BUFFER) { + copyBytes(in, out, BUFF_SIZE); + } else if (flag == CB.BUFF_OFFSET){ + copyBytesWithOffset(in, out, BUFF_SIZE); + } else { + copyByteByByte(in, out); + } + + LOG.info(in); + bandwidth = in.getBytesPerSec(); + Assert.assertEquals(in.getTotalBytesRead(), tmpFile.length()); + Assert.assertTrue(in.getBytesPerSec() > maxBandwidth / (factor * 1.2)); + Assert.assertTrue(in.getTotalSleepTime() > sleepTime || in.getBytesPerSec() <= maxBPS); + } finally { + IOUtils.closeStream(in); + IOUtils.closeStream(out); + } + return bandwidth; + } + + private static void copyBytesWithOffset(InputStream in, OutputStream out, int buffSize) + throws IOException { + + byte buf[] = new byte[buffSize]; + int bytesRead = in.read(buf, 0, buffSize); + while (bytesRead >= 0) { + out.write(buf, 0, bytesRead); + bytesRead = in.read(buf); + } + } + + private static void copyByteByByte(InputStream in, OutputStream out) + throws IOException { + + int ch = in.read(); + while (ch >= 0) { + out.write(ch); + ch = in.read(); + } + } + + private static void copyBytes(InputStream in, OutputStream out, int buffSize) + throws IOException { + + byte buf[] = new byte[buffSize]; + int bytesRead = in.read(buf); + while (bytesRead >= 0) { + out.write(buf, 0, bytesRead); + bytesRead = in.read(buf); + } + } + + private File createFile(long sizeInKB) throws IOException { + File tmpFile = createFile(); + writeToFile(tmpFile, sizeInKB); + return tmpFile; + } + + private File createFile() throws IOException { + return 
File.createTempFile("tmp", "dat");
+  }
+
+  private void writeToFile(File tmpFile, long sizeInKB) throws IOException {
+    OutputStream out = new FileOutputStream(tmpFile);
+    try {
+      byte[] buffer = new byte[1024];
+      for (long index = 0; index < sizeInKB; index++) {
+        out.write(buffer);
+      }
+    } finally {
+      IOUtils.closeStream(out);
+    }
+  }
+}
diff --git a/hadoop-tools/hadoop-distcp/src/test/resources/sslConfig.xml b/hadoop-tools/hadoop-distcp/src/test/resources/sslConfig.xml
new file mode 100644
index 00000000000..016edf27ed1
--- /dev/null
+++ b/hadoop-tools/hadoop-distcp/src/test/resources/sslConfig.xml
@@ -0,0 +1,57 @@
+<?xml version="1.0"?>
+
+<configuration>
+
+<property>
+  <name>ssl.client.truststore.location</name>
+  <value>/path/to/truststore/keys/keystore.jks</value>
+  <description>Truststore to be used by clients like distcp. Must be
+  specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.truststore.password</name>
+  <value>changeit</value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.truststore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.location</name>
+  <value>/path/to/keystore/keys/keystore.jks</value>
+  <description>Keystore to be used by clients like distcp. Must be
+  specified.
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.password</name>
+  <value>changeit</value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.keypassword</name>
+  <value>changeit</value>
+  <description>Optional. Default value is "".
+  </description>
+</property>
+
+<property>
+  <name>ssl.client.keystore.type</name>
+  <value>jks</value>
+  <description>Optional. Default value is "jks".
+  </description>
+</property>
+
+</configuration>
diff --git a/hadoop-tools/pom.xml b/hadoop-tools/pom.xml
index b95798988d9..6ef29a7edfc 100644
--- a/hadoop-tools/pom.xml
+++ b/hadoop-tools/pom.xml
@@ -29,6 +29,7 @@
     <module>hadoop-streaming</module>
+    <module>hadoop-distcp</module>
     <module>hadoop-archives</module>
     <module>hadoop-rumen</module>
     <module>hadoop-tools-dist</module>